Commit

Added documentation and chdir when calling for the second time
RobbinBouwmeester committed Jan 15, 2020
1 parent f7c8de5 commit e1b02e2
Showing 12 changed files with 75,367 additions and 17 deletions.
206 changes: 206 additions & 0 deletions example_data/aicheler_data.csv

Large diffs are not rendered by default.

206 changes: 206 additions & 0 deletions example_data/aicheler_data_features.csv

Large diffs are not rendered by default.

37,191 changes: 37,191 additions & 0 deletions example_data/lm_features.csv

Large diffs are not rendered by default.

37,190 changes: 37,190 additions & 0 deletions example_data/lm_struct.csv

Large diffs are not rendered by default.

49 changes: 48 additions & 1 deletion rt/applyl1.py
@@ -1,10 +1,57 @@
"""
Robbin Bouwmeester
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
This code is used to train retention time predictors and store
predictions from a CV procedure for further analysis.
This project was made possible by MASSTRPLAN. MASSTRPLAN received funding
from the Marie Sklodowska-Curie EU Framework for Research and Innovation
Horizon 2020, under Grant Agreement No. 675132.
"""

from os import listdir
from os.path import isfile, join
import pickle
import pandas
from sklearn.preprocessing import maxabs_scale

def apply_models(X,outfile="",model_path="mods_l1/",known_rt=[],row_identifiers=[],skip_cont=[]):
"""
Apply the models from Layer 1
Parameters
----------
X : pd.DataFrame
dataframe with molecular descriptors
outfile : str
specify the outfile
model_path : str
path to models that need to be applied in Layer 1
known_rt : list
list with known retention times (equal to order in X)
row_identifiers : list
identifiers for each row (equal to order in X)
skip_cont : list
skip these models (provide file names)
Returns
-------
list
list with predictions
list
list with skipped models
"""
model_fn = [f for f in listdir(model_path) if isfile(join(model_path, f))]
preds = []
t_preds = []
@@ -30,7 +77,7 @@ def apply_models(X,outfile="",model_path="mods_l1/",known_rt=[],row_identifiers=
print("Applying model: %s" % (join(model_path, f)))
with open(join(model_path, f),"rb") as model_f:
try: model = pickle.load(model_f,encoding='latin1')
- except: continue #print("Unable to load: %s" % (model_f))
+ except: print("Unable to load: %s" % (model_f))
if "_SVM" in f:
X_temp = maxabs_scale(X)
preds.append(model.predict(X_temp))
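
For reference, a minimal usage sketch of apply_models (not taken from the repository): it assumes the code is run from the rt/ directory so that mods_l1/ is on the relative path, and that the features CSV carries "time" and "IDENTIFIER" columns next to the molecular descriptors, mirroring how main.py calls the function further down in this commit.

    import pandas as pd
    from applyl1 import apply_models

    # Hypothetical input; any CSV with one row of molecular descriptors per molecule works
    df = pd.read_csv("../example_data/aicheler_data_features.csv")

    # Keep descriptors only; "time"/"IDENTIFIER"/"system" are dropped as in main.py
    X = df.drop(["time", "IDENTIFIER", "system"], axis=1, errors="ignore")

    preds, skipped = apply_models(X,
                                  known_rt=df["time"],
                                  row_identifiers=df["IDENTIFIER"],
                                  model_path="mods_l1/")
    print("Skipped models:", skipped)
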
99 changes: 99 additions & 0 deletions rt/getf.py
@@ -1,16 +1,70 @@
"""
Robbin Bouwmeester
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
This code is used to train retention time predictors and store
predictions from a CV procedure for further analysis.
This project was made possible by MASSTRPLAN. MASSTRPLAN received funding
from the Marie Sklodowska-Curie EU Framework for Research and Innovation
Horizon 2020, under Grant Agreement No. 675132.
"""

from rdkit import Chem
from rdkit.Chem import Descriptors
from subprocess import Popen
from subprocess import PIPE
from os import remove

def rdkit_descriptors(mol):
"""
Get the rdkit descriptors
Parameters
----------
mol : rdkit.Chem.rdchem.Mol
molecule object from rdkit
Returns
-------
dict
feature to molecular descriptor dictionary
"""
ret_dict = {}

# Iterate over all molecular descriptors
for name,func in Descriptors.descList:
ret_dict[name] = func(mol)
return(ret_dict)

def cdk_descriptors(mol,temp_f_smiles_name="tempsmiles.smi",temp_f_cdk_name="tempcdk.txt"):
"""
Get the cdk descriptors
Parameters
----------
mol : rdkit.Chem.rdchem.Mol
molecule object from rdkit
temp_f_smiles_name : str
temporary file for storing the SMILES string
temp_f_cdk_name : str
temporary file for storing the cdk output
Returns
-------
dict
feature to molecular descriptor dictionary
"""
ret_dict = {}

smiles = Chem.MolToSmiles(mol,1)
@@ -32,18 +86,63 @@ def cdk_descriptors(mol,temp_f_smiles_name="tempsmiles.smi",temp_f_cdk_name="tem


def call_cdk(infile="",outfile="",descriptors=""):
"""
Call CDK to calculate molecular descriptors
Parameters
----------
descriptors : str
names of the descriptors to calculate
infile : str
file containing the SMILES string
outfile : str
file for storing the cdk output
Returns
-------
dict
feature to molecular descriptor dictionary
"""
cmd = "java -jar CDKDescUI-1.4.6.jar -b %s -a -t %s -o %s" % (infile,descriptors,outfile)
p = Popen(cmd, stdout=PIPE, stderr=PIPE, shell=True)
out = p.communicate()
return(parse_cdk_file(outfile))

def parse_cdk_file(file):
"""
Parse the output of cdk to a dictionary
Parameters
----------
file : str
path to the CDK output file to parse
Returns
-------
dict
feature to molecular descriptor dictionary
"""
cdk_file = open(file).readlines()
cols = cdk_file[0].strip().split()[1:]
feats = cdk_file[1].strip().split()[1:]
return(dict(zip(cols, feats)))
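
As a small illustration of the layout parse_cdk_file expects (one header line, one value line, first column skipped), here is a made-up CDK output file; the descriptor names and values are purely illustrative, and note that the parsed values stay strings.

    from getf import parse_cdk_file

    # Hypothetical CDK output: first column is the molecule title, the rest are descriptors
    with open("tempcdk.txt", "w") as f:
        f.write("Title ALogP TopoPSA nAtom\n")
        f.write("mol_1 1.23 20.23 9\n")

    print(parse_cdk_file("tempcdk.txt"))
    # {'ALogP': '1.23', 'TopoPSA': '20.23', 'nAtom': '9'}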

def getf(mol,progs=["rdkit"]):
"""
Get molecular descriptors for a molecule
Parameters
----------
mol : rdkit.Chem.rdchem.Mol
molecule object from rdkit
progs : list
list with "rdkit" and/or "cdk"
Returns
-------
dict
feature to molecular descriptor dictionary
"""
ret_dict = {}
if "rdkit" in progs: ret_dict["rdkit"] = rdkit_descriptors(mol)
if "cdk" in progs: ret_dict["cdk"] = cdk_descriptors(mol)
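
A short usage sketch for getf using only the RDKit branch (the CDK branch additionally needs the CDKDescUI-1.4.6.jar shown in call_cdk and a Java runtime); the SMILES below is an arbitrary example molecule.

    from rdkit import Chem
    from getf import getf

    mol = Chem.MolFromSmiles("CCO")        # ethanol, as a toy example
    feats = getf(mol, progs=["rdkit"])     # {"rdkit": {descriptor name: value, ...}}
    print(len(feats["rdkit"]), "RDKit descriptors calculated")
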
22 changes: 22 additions & 0 deletions rt/initial_train.py
@@ -1,3 +1,25 @@
"""
Robbin Bouwmeester
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
This code is used to train retention time predictors and store
predictions from a CV procedure for further analysis.
This project was made possible by MASSTRPLAN. MASSTRPLAN received funding
from the Marie Sklodowska-Curie EU Framework for Research and Innovation
Horizon 2020, under Grant Agreement No. 675132.
"""

import subprocess

from random import shuffle
113 changes: 110 additions & 3 deletions rt/main.py
@@ -1,3 +1,25 @@
"""
Robbin Bouwmeester
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
This code is used to train retention time predictors and store
predictions from a CV procedure for further analysis.
This project was made possible by MASSTRPLAN. MASSTRPLAN received funding
from the Marie Sklodowska-Curie EU Framework for Research and Innovation
Horizon 2020, under Grant Agreement No. 675132.
"""

import subprocess

from random import shuffle
@@ -16,22 +38,75 @@
import os

def move_models(k):
"""
Move models so they will not be used in Layer 1
Parameters
----------
k : str
key name for the models that need to be moved
Returns
-------
"""
cmd = "mv mods_l1/%s*.pickle mods_l1/temp/" % (k)
p = subprocess.Popen(cmd, stdout=subprocess.PIPE,stderr=subprocess.PIPE,shell=True)
p.communicate()


def remove_models(k,n):
"""
Remove specific models
Parameters
----------
k : str
key name for the models that need to be removed
n : str
specific numeric identifier for the model to be removed
Returns
-------
"""
cmd = "rm -rf mods_l1/%s*.pickle" % (k)
p = subprocess.Popen(cmd, stdout=subprocess.PIPE,stderr=subprocess.PIPE,shell=True)
p.communicate()

def move_models_back(k):
"""
Move models back so they will be used in Layer 1
Parameters
----------
k : str
key name for the models that need to be moved
Returns
-------
"""
cmd = "mv mods_l1/temp/%s*.pickle mods_l1/" % (k)
p = subprocess.Popen(cmd, stdout=subprocess.PIPE,stderr=subprocess.PIPE,shell=True)
p.communicate()
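
The three helpers above shell out to mv and rm, so they assume a POSIX environment. A portable sketch of the same idea using only the standard library (not what the repository does) could look like the following; the function name is hypothetical and mods_l1/temp/ is assumed to already exist.

    import glob
    import os
    import shutil

    def move_models_portable(k, src="mods_l1/", dst="mods_l1/temp/"):
        # Same effect as move_models, without relying on a shell:
        # move every pickled model whose file name starts with k into dst
        for path in glob.glob(os.path.join(src, "%s*.pickle" % k)):
            shutil.move(path, dst)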

def cv_to_fold(cv,num_ins):
"""
Encode a CV split as a list with one fold label per instance
Parameters
----------
cv : list
list of (train, test) index splits (e.g. from KFold)
num_ins : int
number of instances
Returns
-------
list
fold label for each instance
"""
ret_vec = [0]*num_ins
counter_f = 0
for train,test in cv:
@@ -41,13 +116,43 @@ def cv_to_fold(cv,num_ins):
return(ret_vec)
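
Assuming the hidden part of the loop above assigns the running fold counter to every test index of each split, cv_to_fold can be exercised like this (the printed labels are illustrative; the exact values depend on the shuffle):

    from sklearn.model_selection import KFold

    cv = KFold(n_splits=5, shuffle=True, random_state=42)
    cv = list(cv.split(range(10)))       # 10 instances, 5 folds

    print(cv_to_fold(cv, 10))
    # e.g. [3, 1, 4, 0, 2, 0, 1, 2, 4, 3] -- one fold label per instance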

def make_preds(reference_infile="train_set_lpp2.csv",pred_infile="lmfeatures.csv",k="MASSTRPLAN",outfile="",extra_pred_file="",outfile_modname="",num_jobs=4,GUI_obj=None,ch_size=100000):
- os.chdir("rt/")
"""
Make predictions for the evaluation of CALLC
Parameters
----------
reference_infile : str
location of train data
pred_infile : str
location of file to make predictions for
k : str
key name to add to predictions and models
outfile : str
outfile for the predictions
outfile_modname : str
name for the models it will train
num_jobs : int
number of threads to spawn
GUI_obj : object
GUI object used to update the log
ch_size : int
chunk size for generating predictions
Returns
-------
"""
try: os.chdir("rt/")
except: pass

ref_infile = pd.read_csv(reference_infile)

# Make sure we have the correct data types
dict_dtypes = dict(ref_infile.select_dtypes(include=['int']).apply(pd.to_numeric,downcast="integer").dtypes)
float_dtypes = dict(ref_infile.select_dtypes(include=['float']).apply(pd.to_numeric,downcast="float").dtypes)
dict_dtypes.update(float_dtypes)

# See if we need to chunk the predictions to be memory efficient
tot_preds = sum(1 for row in open(pred_infile,"r"))/ch_size
p_infile = pd.read_csv(pred_infile,dtype=dict_dtypes,chunksize=ch_size)

@@ -66,16 +171,18 @@ def make_preds(reference_infile="train_set_lpp2.csv",pred_infile="lmfeatures.csv
print("===========")
print("Total number of train molecules with tR: %s" % (n))

# Make sure that for the training data we do not have infinite or nan
train = ref_infile
train = train.replace([np.inf, -np.inf], np.nan)
train = train.fillna(0.0)

- #len(train.index),
- cv = KFold(n_splits=5,shuffle=True,random_state=42)
+ # Define the folds to make predictions
+ cv = KFold(n_splits=10,shuffle=True,random_state=42)
cv = list(cv.split(train.index))

cv_list = cv_to_fold(cv,len(train.index))

# Do layer 1 outside of the chunking
preds_own = train_l1_func(train,names=[k,k,k,k,k,k,k],adds=[n,n,n,n,n,n,n,n],cv=cv,outfile_modname=outfile_modname,n_jobs=num_jobs)
preds_l1_train,skipped_train = apply_models(train.drop(["time","IDENTIFIER","system"],axis=1, errors='ignore'),known_rt=train["time"],row_identifiers=train["IDENTIFIER"],skip_cont=[k])
preds_l1_train = pd.concat([preds_l1_train.reset_index(drop=True), preds_own], axis=1)
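
Finally, a hedged sketch of how make_preds might be invoked against the example data added in this commit. It assumes the call is made from inside rt/ (so the os.chdir("rt/") at the top of the function falls through to its except branch and paths resolve relative to rt/) and that these CSVs carry the "time" and "IDENTIFIER" columns the function expects; the key name and output file names are hypothetical.

    from main import make_preds

    make_preds(reference_infile="../example_data/aicheler_data_features.csv",
               pred_infile="../example_data/lm_features.csv",
               k="aicheler",
               outfile="preds_aicheler.csv",
               outfile_modname="aicheler",
               num_jobs=4)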