Skip to content

Commit

Permalink
createAutomatically webservice distinguishes between different modes
Browse files Browse the repository at this point in the history
  • Loading branch information
richardwolfmayr committed Sep 16, 2023
1 parent c8a121a commit 437eeb1
Show file tree
Hide file tree
Showing 7 changed files with 174 additions and 110 deletions.
161 changes: 87 additions & 74 deletions coral/sql.py
Original file line number Diff line number Diff line change
Expand Up @@ -1018,8 +1018,10 @@ def create_automatically():
For the {route} query the following parameter is needed:
- cohortId: id of the cohort
There are also optional parameters:
- attribute0: one column of the entity table, used when called for 2 attributes
- attribute0type: type of the attribute0
- attribute1: one column of the entity table, used when called for 2 attributes
- attribute2: one column of the entity table, used when called for 2 attributes
- attribute1type: type of the attribute1
- attribute: one column of the entity table""".format(
route="cohortData"
)
Expand All @@ -1028,92 +1030,103 @@ def create_automatically():

# based on createUseNumFilter and create cohorts with hdbscan clustering
try:
# different implementations based on different parameters
# if there is just one (numerical) attribute, use hdbscan
# if there are two (numerical) attributes, use hdbscan
# if there is one categorical and one numerical attribute, use k-prototypes
HDBSCAN = "hdbscan"
K_PROTOTYPES = "k-prototypes"
cluster_method = None

query = QueryElements()
cohort = query.get_cohort_from_db(request.values, error_msg) # get parent cohort
sql_text = query.get_cohort_data_sql(request.values, cohort) # get sql statement to retrieve data
query_results = query.execute_sql_query(sql_text, cohort.entity_database)
# _log.debug("query_results %s ", query_results.get_json())

# if there are 2 attributes, use both, if there is just 1 attribute, use one
if 'attribute1' in request.values:
attribute1 = request.values['attribute1']
attribute2 = request.values['attribute2']
tissues = [item for item in query_results.get_json() if item[attribute1] is not None and item[attribute2] is not None]
if "attribute0" in request.values and not "attribute1" in request.values:
# one (numerical) attriubte
cluster_method = HDBSCAN
sql_text = query.get_cohort_data_sql({"attribute": request.values["attribute0"]}, cohort) # get sql statement to retrieve data
attribute0 = {"dataKey": request.values["attribute0"], "type": request.values["attribute0type"]}
query_results = query.execute_sql_query(sql_text, cohort.entity_database)
tissues = [item for item in query_results.get_json() if item[attribute0["dataKey"]] is not None]
tissues_df = pd.DataFrame(tissues)
# get only the values of the attributes
tissues_attribute_df = tissues_df[[attribute1, attribute2]].values
else:
attribute = request.values['attribute']
tissues = [item for item in query_results.get_json() if item[attribute] is not None]
tissues_df = pd.DataFrame(tissues)
tissues_attribute_df = tissues_df[attribute].values.reshape(-1, 1)

tissues_attribute_df = tissues_df[attribute0["dataKey"]].values.reshape(-1, 1)
elif "attribute0" in request.values and "attribute1" in request.values:
# two attributes
if request.values["attribute0type"] == "number" and request.values["attribute1type"] == "number":
# two numerical attributes
cluster_method = HDBSCAN
sql_text = query.get_cohort_data_multi_attr_sql(request.values, cohort) # get sql statement to retrieve data
query_results = query.execute_sql_query(sql_text, cohort.entity_database)
attribute0 = {"dataKey": request.values["attribute0"], "type": request.values["attribute0type"]}
attribute1 = {"dataKey": request.values["attribute1"], "type": request.values["attribute1type"]}
tissues = [item for item in query_results.get_json() if item[attribute0["dataKey"]] is not None and item[attribute1["dataKey"]] is not None]
tissues_df = pd.DataFrame(tissues)
tissues_attribute_df = tissues_df[[attribute0["dataKey"], attribute1["dataKey"]]].values
elif request.values["attribute0type"] == "categorical" and request.values["attribute1type"] == "number" or request.values["attribute0type"] == "number" and request.values["attribute1type"] == "categorical":
cluster_method = K_PROTOTYPES


# hdbscan
clusterer = hdbscan.HDBSCAN(min_cluster_size=round(tissues_attribute_df.shape[0]/50), gen_min_span_tree=True) # one tenth of the number of tissues, to get a reasonable amount of clusters
clusterer.fit(tissues_attribute_df)
# get the labels of the clusters
labels = clusterer.labels_
_log.debug("labels %s", labels)
# get the number of clusters by getting the distinct values of labels
n_clusters_ = len(set(labels))
_log.debug("n_clusters_ %s", n_clusters_)

# create a cohort for each cluster
# add the cluster labels to the tissues
tissues_df['cluster_label'] = labels
_log.debug("tissues_df %s", tissues_df)
# fit the clusterer based on the attribute values

# get all the tissuenames for each cluster
_log.debug("tissues[0] with label %s", tissues[0])
_log.debug("tissues_df[tissues_df['cluster_label'] == 0] %s", tissues_df[tissues_df['cluster_label'] == 0])
# loop over every cluster
# get the distinct values of the cluster_label
_log.debug("set(labels) %s", set(labels))

# clusters_tissuenames = {}
# for i in set(labels):
# clusters_tissuenames[i] = (tissues_df[tissues_df['cluster_label'] == i]['tissuename'].tolist())

# _log.debug("clusters_tissuenames %s", clusters_tissuenames)
# _log.debug("clusters_tissuenames[-1] %s", clusters_tissuenames[-1])
# _log.debug("clusters_tissuenames[0] %s", clusters_tissuenames[0])
# _log.debug("clusters_tissuenames[1] %s", clusters_tissuenames[1])
# _log.debug("clusters_tissuenames[1] %s", clusters_tissuenames[1]['tissuename'])



# create a cohort for each cluster
cohortids = []
for i in set(labels):
clusters_tissuenames = tissues_df[tissues_df['cluster_label'] == i]['tissuename'].tolist()
_log.debug("cohortdebuggg %s", cohort)
# change the statement to use the ids of the cohorts
# Convert the list into a comma-separated string
sql_values = "(" + ", ".join(["'" + item + "'" for item in clusters_tissuenames]) + ")"
sql_text = 'SELECT p.* FROM (SELECT * FROM tissue.tdp_tissue) p WHERE (p.tissuename IN {tissuenames})'.format(tissuenames=(sql_values)) # TODO: make this generic for other table, multiple attributes etc
# _log.debug("sql_text %s", sql_text)
cohort.statement = sql_text
# _log.debug("cohortdebuggg %s", cohort)

new_cohort = query.create_cohort_automatically_from_tissue_names(request.values, cohort,
error_msg) # get filtered cohort from args and cohort
_log.debug("new_cohort %s", new_cohort)
returnvalue = query.add_cohort_to_db(new_cohort).data # save new cohort into DB
# Convert bytes to integers and remove brackets
returnvalue = int(returnvalue.decode("utf-8").strip('[]\n')) # this is a workaround to undo the jsonify that is done in add_cohort_to_db
cohortids.append(returnvalue)
_log.debug("cohortids now %s", cohortids)

# hdbscan end
# 1 or 2 numerical attributes ==> hdbscan
if cluster_method == HDBSCAN:
clusterer = hdbscan.HDBSCAN(min_cluster_size=round(tissues_attribute_df.shape[0]/50), gen_min_span_tree=True) # one tenth of the number of tissues, to get a reasonable amount of clusters
# TODO: how to find a useful min_cluster_size? also: return useful error message if this gets too small somehow
clusterer.fit(tissues_attribute_df)
# get the labels of the clusters
labels = clusterer.labels_
_log.debug("labels %s", labels)
# get the number of clusters by getting the distinct values of labels
n_clusters_ = len(set(labels))
_log.debug("n_clusters_ %s", n_clusters_)

# create a cohort for each cluster
# add the cluster labels to the tissues
tissues_df["cluster_label"] = labels
_log.debug("tissues_df %s", tissues_df)

# get all the tissuenames for each cluster
_log.debug("tissues[0] with label %s", tissues[0])
_log.debug("tissues_df[tissues_df['cluster_label'] == 0] %s", tissues_df[tissues_df["cluster_label"] == 0])
# loop over every cluster
# get the distinct values of the cluster_label
_log.debug("set(labels) %s", set(labels))

# create a cohort for each cluster
cohortids = []
for i in set(labels):
clusters_tissuenames = tissues_df[tissues_df["cluster_label"] == i]["tissuename"].tolist()
_log.debug("cohortdebuggg %s", cohort)
# change the statement to use the ids of the cohorts
# Convert the list into a comma-separated string
sql_values = "(" + ", ".join(["'" + item + "'" for item in clusters_tissuenames]) + ")"
sql_text = "SELECT p.* FROM (SELECT * FROM tissue.tdp_tissue) p WHERE (p.tissuename IN {tissuenames})".format(tissuenames=(sql_values)) # TODO: make this generic for other table, multiple attributes etc
# _log.debug("sql_text %s", sql_text)
cohort.statement = sql_text
# _log.debug("cohortdebuggg %s", cohort)

new_cohort = query.create_cohort_automatically_from_tissue_names(request.values, cohort,
error_msg) # get filtered cohort from args and cohort
_log.debug("new_cohort %s", new_cohort)
returnvalue = query.add_cohort_to_db(new_cohort).data # save new cohort into DB
# Convert bytes to integers and remove brackets
returnvalue = int(returnvalue.decode("utf-8").strip("[]\n")) # this is a workaround to undo the jsonify that is done in add_cohort_to_db
cohortids.append(returnvalue)
_log.debug("cohortids now %s", cohortids)

# hdbscan end
elif cluster_method == K_PROTOTYPES:
# todo: implement
# 1 categorical and 1 numerical attribute ==> k-prototypes
return "not implemented yet"
_log.debug("cohortids %s", cohortids)
return jsonify(cohortids)


except RuntimeError as error:
abort(400, error)



def create():
"""
entry point of this plugin
Expand Down
35 changes: 26 additions & 9 deletions coral/sql_query_mapper.py
Original file line number Diff line number Diff line change
Expand Up @@ -661,15 +661,6 @@ def create_cohort_automatically_from_tissue_names(self, args, cohort, error_msg)
if name is None:
raise RuntimeError(error_msg)

attribute = args.get("attribute")
if attribute is None:
attribute1 = args.get("attribute1")
attribute2 = args.get("attribute2")
if attribute1 is None or attribute2 is None:
raise RuntimeError(error_msg)



sql_text = "SELECT p.* FROM ({entities}) p".format(entities=cohort.statement)
_log.debug("sql_text_create_cohort_automatically_from_tissue_names: %s", sql_text)

Expand Down Expand Up @@ -836,6 +827,32 @@ def get_cohort_data_sql(self, args, cohort):
)

return sql_text

def get_cohort_data_multi_attr_sql(self, args, cohort):
    """Build a SQL statement selecting the entity id column plus two attribute columns.

    :param args: mapping (e.g. ``request.values``) that may contain
        ``"attribute0"`` and ``"attribute1"`` column names.
    :param cohort: cohort whose ``entity_table`` determines the id column and
        whose ``statement`` is used as the inner query.
    :return: SQL text selecting id + both attribute columns, or the unchanged
        ``cohort.statement`` (all attributes) if either attribute is missing.
    """
    attribute0 = args.get("attribute0")
    attribute1 = args.get("attribute1")

    # Map each known entity table to its id column. Unknown tables fall back
    # to "" to preserve existing behavior, although that yields invalid SQL
    # below — NOTE(review): consider raising a RuntimeError instead.
    entity_id_cols = {
        "tdp_tissue": "tissuename",
        "tdp_tissue_2": "tissuename",
        "tdp_cellline": "celllinename",
        "student_view_anonym": "id",
        "korea": "id",
    }
    entity_id_col = entity_id_cols.get(cohort.entity_table, "")

    # default: return the unmodified statement (all attributes)
    sql_text = cohort.statement
    if attribute0 is not None and attribute1 is not None:
        # both attributes given: project only the id and the two attribute columns
        # NOTE(review): attribute names are interpolated directly into the SQL
        # text — they should be validated against the table's columns upstream
        sql_text = "SELECT p.{entity_id_col}, p.{attribute0}, p.{attribute1} FROM ({entities}) p".format(
            entity_id_col=entity_id_col, attribute0=attribute0, attribute1=attribute1, entities=cohort.statement
        )

    return sql_text

def get_cohort_size_sql(self, cohort):
sql_text = "SELECT COUNT(p.*) as size FROM ({entities}) p".format(entities=cohort.statement)
Expand Down
9 changes: 5 additions & 4 deletions src/Taskview/visualizations/AVegaVisualization.ts
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ import { IVisualization } from './IVisualization';
import { IAttribute, IdValuePair } from '../../data/IAttribute';
import {
ICohortDBDataParams,
ICohortDBWithNumFilterParams,
ICohortDBWithNumFilterParams, ICohortMultiAttrDBDataParams,
IEqualsList,
INumRange,
NumRangeOperators
Expand Down Expand Up @@ -728,9 +728,10 @@ export abstract class SingleAttributeVisualization extends AVegaVisualization {
let newCohortIds = [];
if (bins.length === 1) { // TODO: what about more than one?
let cohort = bins[0].cohort;
const params: ICohortDBDataParams = {
const params: ICohortMultiAttrDBDataParams = {
cohortId: cohort.dbId,
attribute: this.attribute.dataKey
attribute0: this.attribute.dataKey,
attribute0type: this.attribute.type
};
newCohortIds = await createDBCohortAutomatically(params)
console.log("createAutomatically data", newCohortIds);
Expand Down Expand Up @@ -806,7 +807,6 @@ export abstract class SingleAttributeVisualization extends AVegaVisualization {
<div role="tabpanel" class="tab-pane" id="split">
<div class="flex-wrapper" data-attr="${this.attribute.dataKey}">
<button type="button" class="btn recommendSplitBtn btn-coral-prime" title="Calculate meaningful splits.">Recommend split</button>
<button type="button" class="btn createAutomaticallyBtn btn-coral-prime" title="Calculate meaningful splits.">Create cohorts automatically</button>
<label>Split into</label>
<input type="number" class="bins" step="any" min="1" max="99" value="2"/>
<label >bins of</label>
Expand All @@ -824,6 +824,7 @@ export abstract class SingleAttributeVisualization extends AVegaVisualization {
</div>
<div class="d-grid gap-2">
<button type="button" class="btn applyBtn btn-coral-prime" title="Apply to get a preview of the output cohorts.">Apply</button>
<button type="button" class="btn createAutomaticallyBtn btn-coral-prime" title="Calculate meaningful splits.">Create cohorts automatically</button>
</div>
`,
);
Expand Down
39 changes: 36 additions & 3 deletions src/Taskview/visualizations/GroupedBoxplot.ts
Original file line number Diff line number Diff line change
Expand Up @@ -4,13 +4,13 @@ import log from 'loglevel';
import { Spec as VegaSpec } from 'vega';
import { TopLevelSpec as VegaLiteSpec } from 'vega-lite';
import { ICohort } from '../../app/interfaces';
import { IAttributeFilter, IFilterDesc } from '../../util';
import { FilterEvent, SplitEvent } from '../../base/events';
import {IAttributeFilter, IFilterDesc, INewCohortDesc} from '../../util';
import {AutoSplitEvent, FilterEvent, SplitEvent} from '../../base/events';
import { AVegaVisualization } from './AVegaVisualization';
import { groupByConfig } from './config/GroupConfig';
import { BRUSH_DATA_END, BRUSH_DATA_NAME, BRUSH_DATA_START, DATA_LABEL } from './constants';
import { MultiAttributeVisualization } from './MultiAttributeVisualization';
import { NumRangeOperators } from '../../base';
import {createDBCohortAutomatically, ICohortMultiAttrDBDataParams, NumRangeOperators} from '../../base';
import { IAttribute, IdValuePair } from '../../data';

export class GroupedBoxplot extends MultiAttributeVisualization {
Expand Down Expand Up @@ -242,6 +242,39 @@ export class GroupedBoxplot extends MultiAttributeVisualization {
this.container.dispatchEvent(new SplitEvent(filterDescs));
}

async createAutomatically() {
  console.log("createAutomatically GroupedBoxplot");

  // Ask the backend to split each selected cohort automatically based on the
  // two attributes shown in this visualization.
  // BUGFIX: previously `newCohortIds` was reassigned on every loop iteration,
  // so the ids returned for all but the LAST selected cohort were discarded,
  // and the second loop then paired every cohort with the last cohort's ids.
  // We now build the descriptors per source cohort inside a single loop.
  const cohortDescs: INewCohortDesc[] = [];
  for (const cht of this.cohorts) {
    const params: ICohortMultiAttrDBDataParams = {
      cohortId: cht.dbId,
      attribute0: this.attributes[0].dataKey,
      attribute0type: this.attributes[0].type,
      attribute1: this.attributes[1].dataKey,
      attribute1type: this.attributes[1].type
    };
    const newCohortIds = await createDBCohortAutomatically(params);
    console.log("createAutomatically scatterplot data", newCohortIds);
    // for every newCohort create a filter (for now... the filter is actually
    // not needed, will be changed in the future)
    for (const newCohort of newCohortIds) {
      cohortDescs.push({
        cohort: cht,
        newCohortId: newCohort,
        attr: [this.attributes[0], this.attributes[1]]
      });
    }
  }

  this.container.dispatchEvent(new AutoSplitEvent(cohortDescs));
}

getSpec(data: IdValuePair[]): VegaLiteSpec {
this.catAttribute = this.attributes.find((attr) => ['categorical', 'string'].includes(attr.type));
this.numAttribute = this.attributes.find((attr) => attr.type === `number`);
Expand Down
8 changes: 6 additions & 2 deletions src/Taskview/visualizations/Scatterplot.ts
Original file line number Diff line number Diff line change
Expand Up @@ -881,12 +881,16 @@ export class Scatterplot extends MultiAttributeVisualization {
async createAutomatically() {
console.log("createAutomatically scatterplot");

// AttributeType = 'categorical' | 'number' | 'string'; TODO send it with the data

let newCohortIds = [];
for (const cht of this.cohorts) {
const params: ICohortMultiAttrDBDataParams = {
cohortId: cht.dbId,
attribute1: "age",
attribute2: "bmi"
attribute0: this.attributes[0].dataKey,
attribute0type: this.attributes[0].type,
attribute1: this.attributes[1].dataKey,
attribute1type: this.attributes[1].type,
};
newCohortIds = await createDBCohortAutomatically(params)
console.log("createAutomatically scatterplot data", newCohortIds);
Expand Down
4 changes: 3 additions & 1 deletion src/base/interfaces.ts
Original file line number Diff line number Diff line change
Expand Up @@ -90,8 +90,10 @@ export interface ICohortDBDataParams extends IParams {

/**
 * Parameters for cohort webservice calls that operate on up to two attributes
 * (columns) of a cohort's entity table, e.g. automatic cohort creation.
 */
export interface ICohortMultiAttrDBDataParams extends IParams {
  /** database id of the (parent) cohort */
  cohortId: number;
  /** column name of the first attribute (optional) */
  attribute0?: string;
  /** type of attribute0 — presumably 'number' | 'categorical' | 'string'; TODO confirm against AttributeType */
  attribute0type?: string;
  /** column name of the second attribute (optional) */
  attribute1?: string;
  /** type of attribute1 — presumably 'number' | 'categorical' | 'string'; TODO confirm against AttributeType */
  attribute1type?: string;
}

export interface ICohortDBSizeParams extends IParams {
Expand Down
Loading

0 comments on commit 437eeb1

Please sign in to comment.