Skip to content

Commit

Permalink
createAutomatically webservice distinguishes between different modes
Browse files Browse the repository at this point in the history
  • Loading branch information
richardwolfmayr committed Sep 16, 2023
1 parent c8a121a commit 437eeb1
Show file tree
Hide file tree
Showing 7 changed files with 174 additions and 110 deletions.
161 changes: 87 additions & 74 deletions coral/sql.py
Original file line number Diff line number Diff line change
Expand Up @@ -1018,8 +1018,10 @@ def create_automatically():
For the {route} query the following parameter is needed:
- cohortId: id of the cohort
There are also optional parameters:
- attribute0: one column of the entity table, used when called for 2 attributes
- attribute0type: type of the attribute0
- attribute1: one column of the entity table, used when called for 2 attributes
- attribute2: one column of the entity table, used when called for 2 attributes
- attribute1type: type of the attribute1
- attribute: one column of the entity table""".format(
route="cohortData"
)
Expand All @@ -1028,92 +1030,103 @@ def create_automatically():

# based on createUseNumFilter and create cohorts with hdbscan clustering
try:
# different implementations based on different parameters
# if there is just one (numerical) attribute, use hdbscan
# if there are two (numerical) attributes, use hdbscan
# if there is one categorical and one numerical attribute, use k-prototypes
HDBSCAN = "hdbscan"
K_PROTOTYPES = "k-prototypes"
cluster_method = None

query = QueryElements()
cohort = query.get_cohort_from_db(request.values, error_msg) # get parent cohort
sql_text = query.get_cohort_data_sql(request.values, cohort) # get sql statement to retrieve data
query_results = query.execute_sql_query(sql_text, cohort.entity_database)
# _log.debug("query_results %s ", query_results.get_json())

# if there are 2 attributes, use both, if there is just 1 attribute, use one
if 'attribute1' in request.values:
attribute1 = request.values['attribute1']
attribute2 = request.values['attribute2']
tissues = [item for item in query_results.get_json() if item[attribute1] is not None and item[attribute2] is not None]
if "attribute0" in request.values and not "attribute1" in request.values:
# one (numerical) attriubte
cluster_method = HDBSCAN
sql_text = query.get_cohort_data_sql({"attribute": request.values["attribute0"]}, cohort) # get sql statement to retrieve data
attribute0 = {"dataKey": request.values["attribute0"], "type": request.values["attribute0type"]}
query_results = query.execute_sql_query(sql_text, cohort.entity_database)
tissues = [item for item in query_results.get_json() if item[attribute0["dataKey"]] is not None]
tissues_df = pd.DataFrame(tissues)
# get only the values of the attributes
tissues_attribute_df = tissues_df[[attribute1, attribute2]].values
else:
attribute = request.values['attribute']
tissues = [item for item in query_results.get_json() if item[attribute] is not None]
tissues_df = pd.DataFrame(tissues)
tissues_attribute_df = tissues_df[attribute].values.reshape(-1, 1)

tissues_attribute_df = tissues_df[attribute0["dataKey"]].values.reshape(-1, 1)
elif "attribute0" in request.values and "attribute1" in request.values:
# two attributes
if request.values["attribute0type"] == "number" and request.values["attribute1type"] == "number":
# two numerical attributes
cluster_method = HDBSCAN
sql_text = query.get_cohort_data_multi_attr_sql(request.values, cohort) # get sql statement to retrieve data
query_results = query.execute_sql_query(sql_text, cohort.entity_database)
attribute0 = {"dataKey": request.values["attribute0"], "type": request.values["attribute0type"]}
attribute1 = {"dataKey": request.values["attribute1"], "type": request.values["attribute1type"]}
tissues = [item for item in query_results.get_json() if item[attribute0["dataKey"]] is not None and item[attribute1["dataKey"]] is not None]
tissues_df = pd.DataFrame(tissues)
tissues_attribute_df = tissues_df[[attribute0["dataKey"], attribute1["dataKey"]]].values
elif request.values["attribute0type"] == "categorical" and request.values["attribute1type"] == "number" or request.values["attribute0type"] == "number" and request.values["attribute1type"] == "categorical":
cluster_method = K_PROTOTYPES


# hdbscan
clusterer = hdbscan.HDBSCAN(min_cluster_size=round(tissues_attribute_df.shape[0]/50), gen_min_span_tree=True) # one tenth of the number of tissues, to get a reasonable amount of clusters
clusterer.fit(tissues_attribute_df)
# get the labels of the clusters
labels = clusterer.labels_
_log.debug("labels %s", labels)
# get the number of clusters by getting the distinct values of labels
n_clusters_ = len(set(labels))
_log.debug("n_clusters_ %s", n_clusters_)

# create a cohort for each cluster
# add the cluster labels to the tissues
tissues_df['cluster_label'] = labels
_log.debug("tissues_df %s", tissues_df)
# fit the clusterer based on the attribute values

# get all the tissuenames for each cluster
_log.debug("tissues[0] with label %s", tissues[0])
_log.debug("tissues_df[tissues_df['cluster_label'] == 0] %s", tissues_df[tissues_df['cluster_label'] == 0])
# loop over every cluster
# get the distinct values of the cluster_label
_log.debug("set(labels) %s", set(labels))

# clusters_tissuenames = {}
# for i in set(labels):
# clusters_tissuenames[i] = (tissues_df[tissues_df['cluster_label'] == i]['tissuename'].tolist())

# _log.debug("clusters_tissuenames %s", clusters_tissuenames)
# _log.debug("clusters_tissuenames[-1] %s", clusters_tissuenames[-1])
# _log.debug("clusters_tissuenames[0] %s", clusters_tissuenames[0])
# _log.debug("clusters_tissuenames[1] %s", clusters_tissuenames[1])
# _log.debug("clusters_tissuenames[1] %s", clusters_tissuenames[1]['tissuename'])



# create a cohort for each cluster
cohortids = []
for i in set(labels):
clusters_tissuenames = tissues_df[tissues_df['cluster_label'] == i]['tissuename'].tolist()
_log.debug("cohortdebuggg %s", cohort)
# change the statement to use the ids of the cohorts
# Convert the list into a comma-separated string
sql_values = "(" + ", ".join(["'" + item + "'" for item in clusters_tissuenames]) + ")"
sql_text = 'SELECT p.* FROM (SELECT * FROM tissue.tdp_tissue) p WHERE (p.tissuename IN {tissuenames})'.format(tissuenames=(sql_values)) # TODO: make this generic for other table, multiple attributes etc
# _log.debug("sql_text %s", sql_text)
cohort.statement = sql_text
# _log.debug("cohortdebuggg %s", cohort)

new_cohort = query.create_cohort_automatically_from_tissue_names(request.values, cohort,
error_msg) # get filtered cohort from args and cohort
_log.debug("new_cohort %s", new_cohort)
returnvalue = query.add_cohort_to_db(new_cohort).data # save new cohort into DB
# Convert bytes to integers and remove brackets
returnvalue = int(returnvalue.decode("utf-8").strip('[]\n')) # this is a workaround to undo the jsonify that is done in add_cohort_to_db
cohortids.append(returnvalue)
_log.debug("cohortids now %s", cohortids)

# hdbscan end
# 1 or 2 numerical attributes ==> hdbscan
if cluster_method == HDBSCAN:
clusterer = hdbscan.HDBSCAN(min_cluster_size=round(tissues_attribute_df.shape[0]/50), gen_min_span_tree=True) # one tenth of the number of tissues, to get a reasonable amount of clusters
# TODO: how to find a useful min_cluster_size? also: return useful error message if this gets too small somehow
clusterer.fit(tissues_attribute_df)
# get the labels of the clusters
labels = clusterer.labels_
_log.debug("labels %s", labels)
# get the number of clusters by getting the distinct values of labels
n_clusters_ = len(set(labels))
_log.debug("n_clusters_ %s", n_clusters_)

# create a cohort for each cluster
# add the cluster labels to the tissues
tissues_df["cluster_label"] = labels
_log.debug("tissues_df %s", tissues_df)

# get all the tissuenames for each cluster
_log.debug("tissues[0] with label %s", tissues[0])
_log.debug("tissues_df[tissues_df['cluster_label'] == 0] %s", tissues_df[tissues_df["cluster_label"] == 0])
# loop over every cluster
# get the distinct values of the cluster_label
_log.debug("set(labels) %s", set(labels))

# create a cohort for each cluster
cohortids = []
for i in set(labels):
clusters_tissuenames = tissues_df[tissues_df["cluster_label"] == i]["tissuename"].tolist()
_log.debug("cohortdebuggg %s", cohort)
# change the statement to use the ids of the cohorts
# Convert the list into a comma-separated string
sql_values = "(" + ", ".join(["'" + item + "'" for item in clusters_tissuenames]) + ")"
sql_text = "SELECT p.* FROM (SELECT * FROM tissue.tdp_tissue) p WHERE (p.tissuename IN {tissuenames})".format(tissuenames=(sql_values)) # TODO: make this generic for other table, multiple attributes etc
# _log.debug("sql_text %s", sql_text)
cohort.statement = sql_text
# _log.debug("cohortdebuggg %s", cohort)

new_cohort = query.create_cohort_automatically_from_tissue_names(request.values, cohort,
error_msg) # get filtered cohort from args and cohort
_log.debug("new_cohort %s", new_cohort)
returnvalue = query.add_cohort_to_db(new_cohort).data # save new cohort into DB
# Convert bytes to integers and remove brackets
returnvalue = int(returnvalue.decode("utf-8").strip("[]\n")) # this is a workaround to undo the jsonify that is done in add_cohort_to_db
cohortids.append(returnvalue)
_log.debug("cohortids now %s", cohortids)

# hdbscan end
elif cluster_method == K_PROTOTYPES:
# todo: implement
# 1 categorical and 1 numerical attribute ==> k-prototypes
return "not implemented yet"
_log.debug("cohortids %s", cohortids)
return jsonify(cohortids)


except RuntimeError as error:
abort(400, error)



def create():
"""
entry point of this plugin
Expand Down
35 changes: 26 additions & 9 deletions coral/sql_query_mapper.py
Original file line number Diff line number Diff line change
Expand Up @@ -661,15 +661,6 @@ def create_cohort_automatically_from_tissue_names(self, args, cohort, error_msg)
if name is None:
raise RuntimeError(error_msg)

attribute = args.get("attribute")
if attribute is None:
attribute1 = args.get("attribute1")
attribute2 = args.get("attribute2")
if attribute1 is None or attribute2 is None:
raise RuntimeError(error_msg)



sql_text = "SELECT p.* FROM ({entities}) p".format(entities=cohort.statement)
_log.debug("sql_text_create_cohort_automatically_from_tissue_names: %s", sql_text)

Expand Down Expand Up @@ -836,6 +827,32 @@ def get_cohort_data_sql(self, args, cohort):
)

return sql_text

def get_cohort_data_multi_attr_sql(self, args, cohort):
    """Build a SQL statement selecting the entity id column plus two attribute columns.

    :param args: mapping (e.g. ``request.values``) that may contain
        ``"attribute0"`` and ``"attribute1"`` column names.
    :param cohort: cohort whose ``entity_table`` determines the id column and
        whose ``statement`` is used as the inner query.
    :return: SQL text selecting id + both attribute columns, or the unchanged
        ``cohort.statement`` (all attributes) if either attribute is missing.
    """
    attribute0 = args.get("attribute0")
    attribute1 = args.get("attribute1")

    # Map each known entity table to its id column. Unknown tables fall back
    # to "" to preserve existing behavior, although that yields invalid SQL
    # below — NOTE(review): consider raising a RuntimeError instead.
    entity_id_cols = {
        "tdp_tissue": "tissuename",
        "tdp_tissue_2": "tissuename",
        "tdp_cellline": "celllinename",
        "student_view_anonym": "id",
        "korea": "id",
    }
    entity_id_col = entity_id_cols.get(cohort.entity_table, "")

    # default: return the unmodified statement (all attributes)
    sql_text = cohort.statement
    if attribute0 is not None and attribute1 is not None:
        # both attributes given: project only the id and the two attribute columns
        # NOTE(review): attribute names are interpolated directly into the SQL
        # text — they should be validated against the table's columns upstream
        sql_text = "SELECT p.{entity_id_col}, p.{attribute0}, p.{attribute1} FROM ({entities}) p".format(
            entity_id_col=entity_id_col, attribute0=attribute0, attribute1=attribute1, entities=cohort.statement
        )

    return sql_text

def get_cohort_size_sql(self, cohort):
sql_text = "SELECT COUNT(p.*) as size FROM ({entities}) p".format(entities=cohort.statement)
Expand Down
9 changes: 5 additions & 4 deletions src/Taskview/visualizations/AVegaVisualization.ts
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ import { IVisualization } from './IVisualization';
import { IAttribute, IdValuePair } from '../../data/IAttribute';
import {
ICohortDBDataParams,
ICohortDBWithNumFilterParams,
ICohortDBWithNumFilterParams, ICohortMultiAttrDBDataParams,
IEqualsList,
INumRange,
NumRangeOperators
Expand Down Expand Up @@ -728,9 +728,10 @@ export abstract class SingleAttributeVisualization extends AVegaVisualization {
let newCohortIds = [];
if (bins.length === 1) { // TODO: what about more than one?
let cohort = bins[0].cohort;
const params: ICohortDBDataParams = {
const params: ICohortMultiAttrDBDataParams = {
cohortId: cohort.dbId,
attribute: this.attribute.dataKey
attribute0: this.attribute.dataKey,
attribute0type: this.attribute.type
};
newCohortIds = await createDBCohortAutomatically(params)
console.log("createAutomatically data", newCohortIds);
Expand Down Expand Up @@ -806,7 +807,6 @@ export abstract class SingleAttributeVisualization extends AVegaVisualization {
<div role="tabpanel" class="tab-pane" id="split">
<div class="flex-wrapper" data-attr="${this.attribute.dataKey}">
<button type="button" class="btn recommendSplitBtn btn-coral-prime" title="Calculate meaningful splits.">Recommend split</button>
<button type="button" class="btn createAutomaticallyBtn btn-coral-prime" title="Calculate meaningful splits.">Create cohorts automatically</button>
<label>Split into</label>
<input type="number" class="bins" step="any" min="1" max="99" value="2"/>
<label >bins of</label>
Expand All @@ -824,6 +824,7 @@ export abstract class SingleAttributeVisualization extends AVegaVisualization {
</div>
<div class="d-grid gap-2">
<button type="button" class="btn applyBtn btn-coral-prime" title="Apply to get a preview of the output cohorts.">Apply</button>
<button type="button" class="btn createAutomaticallyBtn btn-coral-prime" title="Calculate meaningful splits.">Create cohorts automatically</button>
</div>
`,
);
Expand Down
39 changes: 36 additions & 3 deletions src/Taskview/visualizations/GroupedBoxplot.ts
Original file line number Diff line number Diff line change
Expand Up @@ -4,13 +4,13 @@ import log from 'loglevel';
import { Spec as VegaSpec } from 'vega';
import { TopLevelSpec as VegaLiteSpec } from 'vega-lite';
import { ICohort } from '../../app/interfaces';
import { IAttributeFilter, IFilterDesc } from '../../util';
import { FilterEvent, SplitEvent } from '../../base/events';
import {IAttributeFilter, IFilterDesc, INewCohortDesc} from '../../util';
import {AutoSplitEvent, FilterEvent, SplitEvent} from '../../base/events';
import { AVegaVisualization } from './AVegaVisualization';
import { groupByConfig } from './config/GroupConfig';
import { BRUSH_DATA_END, BRUSH_DATA_NAME, BRUSH_DATA_START, DATA_LABEL } from './constants';
import { MultiAttributeVisualization } from './MultiAttributeVisualization';
import { NumRangeOperators } from '../../base';
import {createDBCohortAutomatically, ICohortMultiAttrDBDataParams, NumRangeOperators} from '../../base';
import { IAttribute, IdValuePair } from '../../data';

export class GroupedBoxplot extends MultiAttributeVisualization {
Expand Down Expand Up @@ -242,6 +242,39 @@ export class GroupedBoxplot extends MultiAttributeVisualization {
this.container.dispatchEvent(new SplitEvent(filterDescs));
}

async createAutomatically() {
  console.log("createAutomatically GroupedBoxplot");

  // Ask the backend to split each selected cohort automatically based on the
  // two attributes shown in this visualization.
  // BUGFIX: previously `newCohortIds` was reassigned on every loop iteration,
  // so the ids returned for all but the LAST selected cohort were discarded,
  // and the second loop then paired every cohort with the last cohort's ids.
  // We now build the descriptors per source cohort inside a single loop.
  const cohortDescs: INewCohortDesc[] = [];
  for (const cht of this.cohorts) {
    const params: ICohortMultiAttrDBDataParams = {
      cohortId: cht.dbId,
      attribute0: this.attributes[0].dataKey,
      attribute0type: this.attributes[0].type,
      attribute1: this.attributes[1].dataKey,
      attribute1type: this.attributes[1].type
    };
    const newCohortIds = await createDBCohortAutomatically(params);
    console.log("createAutomatically scatterplot data", newCohortIds);
    // for every newCohort create a filter (for now... the filter is actually
    // not needed, will be changed in the future)
    for (const newCohort of newCohortIds) {
      cohortDescs.push({
        cohort: cht,
        newCohortId: newCohort,
        attr: [this.attributes[0], this.attributes[1]]
      });
    }
  }

  this.container.dispatchEvent(new AutoSplitEvent(cohortDescs));
}

getSpec(data: IdValuePair[]): VegaLiteSpec {
this.catAttribute = this.attributes.find((attr) => ['categorical', 'string'].includes(attr.type));
this.numAttribute = this.attributes.find((attr) => attr.type === `number`);
Expand Down
8 changes: 6 additions & 2 deletions src/Taskview/visualizations/Scatterplot.ts
Original file line number Diff line number Diff line change
Expand Up @@ -881,12 +881,16 @@ export class Scatterplot extends MultiAttributeVisualization {
async createAutomatically() {
console.log("createAutomatically scatterplot");

// AttributeType = 'categorical' | 'number' | 'string'; TODO send it with the data

let newCohortIds = [];
for (const cht of this.cohorts) {
const params: ICohortMultiAttrDBDataParams = {
cohortId: cht.dbId,
attribute1: "age",
attribute2: "bmi"
attribute0: this.attributes[0].dataKey,
attribute0type: this.attributes[0].type,
attribute1: this.attributes[1].dataKey,
attribute1type: this.attributes[1].type,
};
newCohortIds = await createDBCohortAutomatically(params)
console.log("createAutomatically scatterplot data", newCohortIds);
Expand Down
4 changes: 3 additions & 1 deletion src/base/interfaces.ts
Original file line number Diff line number Diff line change
Expand Up @@ -90,8 +90,10 @@ export interface ICohortDBDataParams extends IParams {

/**
 * Parameters for cohort webservice calls that operate on up to two attributes
 * (columns) of a cohort's entity table, e.g. automatic cohort creation.
 */
export interface ICohortMultiAttrDBDataParams extends IParams {
  /** database id of the (parent) cohort */
  cohortId: number;
  /** column name of the first attribute (optional) */
  attribute0?: string;
  /** type of attribute0 — presumably 'number' | 'categorical' | 'string'; TODO confirm against AttributeType */
  attribute0type?: string;
  /** column name of the second attribute (optional) */
  attribute1?: string;
  /** type of attribute1 — presumably 'number' | 'categorical' | 'string'; TODO confirm against AttributeType */
  attribute1type?: string;
}

export interface ICohortDBSizeParams extends IParams {
Expand Down
Loading

0 comments on commit 437eeb1

Please sign in to comment.