Scripts used in the analysis of Phytophthora genomes
ands used during analysis of phytophthora genomes. Note - all this work was performed in the directory: /projects/oldhome/groups/harrisonlab/project_files/idris
The following is a summary of the work presented in this Readme: Data organisation:
- Preparing data
Draft Genome assembly - Data qc
- Genome assembly
- Repeatmasking
- Gene prediction
- Functional annotation Genome analysis
- Homology between predicted genes & published effectors
#Data organisation
Data was copied from the raw_data repository to a local directory for assembly and annotation.
cd /projects/oldhome/groups/harrisonlab/project_files/idris
mkdir -p raw_dna/paired/P.cactorum/404/F
mkdir -p raw_dna/paired/P.cactorum/404/R
mkdir -p raw_dna/paired/P.cactorum/414/F
mkdir -p raw_dna/paired/P.cactorum/414/R
RawDat=/projects/oldhome/groups/harrisonlab/raw_data/raw_seq/cactorum/Cactorum/130614_P404
cp $RawDat/cactp404_S3_L001_R1_001.fastq.gz raw_dna/paired/P.cactorum/404/F/.
cp $RawDat/cactp404_S3_L001_R2_001.fastq.gz raw_dna/paired/P.cactorum/404/R/.
RawDat=/projects/oldhome/groups/harrisonlab/raw_data/raw_seq/cactorum/Cactorum/130624_P404
cp $RawDat/130624_cactp404_S3_L001_R1_001.fastq.gz raw_dna/paired/P.cactorum/404/F/.
cp $RawDat/130624_cactp404_S3_L001_R2_001.fastq.gz raw_dna/paired/P.cactorum/404/R/.
RawDat="/projects/oldhome/groups/harrisonlab/raw_data/raw_seq/cactorum/Cactorum/Cactorum\ 130120"
cp /projects/oldhome/groups/harrisonlab/raw_data/raw_seq/cactorum/Cactorum/Cactorum\ 130120/cact414_S2_L001_R1_001.fastq.gz raw_dna/paired/P.cactorum/414/F/414_run1_F.fastq.gz
cp /projects/oldhome/groups/harrisonlab/raw_data/raw_seq/cactorum/Cactorum/Cactorum\ 130120/cact414_S2_L001_R2_001.fastq.gz raw_dna/paired/P.cactorum/414/R/414_run1_R.fastq.gz
RawDat="/projects/oldhome/groups/harrisonlab/raw_data/raw_seq/cactorum/Cactorum/Cactorum\ 130517"
cp /projects/oldhome/groups/harrisonlab/raw_data/raw_seq/cactorum/Cactorum/Cactorum\ 130517/cact414_S2_L001_R1_001.fastq.gz raw_dna/paired/P.cactorum/414/F/414_run2_F.fastq.gz
cp /projects/oldhome/groups/harrisonlab/raw_data/raw_seq/cactorum/Cactorum/Cactorum\ 130517/cact414_S2_L001_R2_001.fastq.gz raw_dna/paired/P.cactorum/414/R/414_run2_R.fastq.gz
mkdir -p raw_dna/paired/P.cactorum/415/F
mkdir -p raw_dna/paired/P.cactorum/415/R
mkdir -p raw_dna/paired/P.cactorum/416/F
mkdir -p raw_dna/paired/P.cactorum/416/R
mkdir -p raw_dna/paired/P.cactorum/62471/F/.
mkdir -p raw_dna/paired/P.cactorum/62471/R/.
RawDat=/projects/oldhome/groups/harrisonlab/raw_data/raw_seq/raw_reads/150716_M01678_0023_AB0YF
cp $RawDat/Pcactorum415_S2_L001_R1_001.fastq.gz raw_dna/paired/P.cactorum/415/F/.
cp $RawDat/Pcactorum415_S2_L001_R2_001.fastq.gz raw_dna/paired/P.cactorum/415/R/.
cp $RawDat/Pcactorum416_S1_L001_R1_001.fastq.gz raw_dna/paired/P.cactorum/416/F/.
cp $RawDat/Pcactorum416_S1_L001_R2_001.fastq.gz raw_dna/paired/P.cactorum/416/R/.
RawDatDir=/projects/oldhome/groups/harrisonlab/raw_data/raw_seq/raw_reads/160108_M01678_0039_AEMMF
cp $RawDatDir/62471_S3_L001_R1_001.fastq.gz raw_dna/paired/P.cactorum/62471/F/.
cp $RawDatDir/62471_S3_L001_R2_001.fastq.gz raw_dna/paired/P.cactorum/62471/R/.
# Data run on 13/10/16 161010_M04465_0026_000000000-APP5B
RawDatDir=/projects/oldhome/miseq_readonly/miseq_data/2016/RAW/161010_M04465_0026_000000000-APP5B/Data/Intensities/BaseCalls
Species=P.cactorum
Strain=15_7
mkdir -p raw_dna/paired/$Species/$Strain/F
mkdir -p raw_dna/paired/$Species/$Strain/R
cp $RawDatDir/Pcact15-07_S2_L001_R1_001.fastq.gz raw_dna/paired/$Species/$Strain/F/.
cp $RawDatDir/Pcact15-07_S2_L001_R2_001.fastq.gz raw_dna/paired/$Species/$Strain/R/.
Species=P.cactorum
Strain=15_13
mkdir -p raw_dna/paired/$Species/$Strain/F
mkdir -p raw_dna/paired/$Species/$Strain/R
cp $RawDatDir/Pcact15-13_S3_L001_R1_001.fastq.gz raw_dna/paired/$Species/$Strain/F/.
cp $RawDatDir/Pcact15-13_S3_L001_R2_001.fastq.gz raw_dna/paired/$Species/$Strain/R/.
# Data run on 10/02/17 170210_M04465_0034_000000000-ATJ0E
RawDatDir=/projects/oldhome/miseq_data/2017/RAW/170210_M04465_0034_000000000-ATJ0E/Data/Intensities/BaseCalls
Date=170210
Species=P.cactorum
Strain=404
mkdir -p raw_dna/paired/$Species/$Strain/F
mkdir -p raw_dna/paired/$Species/$Strain/R
cp $RawDatDir/P404_S3_L001_R1_001.fastq.gz raw_dna/paired/$Species/$Strain/F/"$Strain"_"$Date"_F.fastq.gz
cp $RawDatDir/P404_S3_L001_R2_001.fastq.gz raw_dna/paired/$Species/$Strain/R/"$Strain"_"$Date"_R.fastq.gz
Species=P.cactorum
Strain=414
mkdir -p raw_dna/paired/$Species/$Strain/F
mkdir -p raw_dna/paired/$Species/$Strain/R
cp $RawDatDir/P414_S4_L001_R1_001.fastq.gz raw_dna/paired/$Species/$Strain/F/"$Strain"_"$Date"_F.fastq.gz
cp $RawDatDir/P414_S4_L001_R2_001.fastq.gz raw_dna/paired/$Species/$Strain/R/"$Strain"_"$Date"_R.fastq.gz
Species=P.cactorum
Strain=P295
mkdir -p raw_dna/paired/$Species/$Strain/F
mkdir -p raw_dna/paired/$Species/$Strain/R
cp $RawDatDir/P295_S2_L001_R1_001.fastq.gz raw_dna/paired/$Species/$Strain/F/"$Strain"_"$Date"_F.fastq.gz
cp $RawDatDir/P295_S2_L001_R2_001.fastq.gz raw_dna/paired/$Species/$Strain/R/"$Strain"_"$Date"_R.fastq.gz
Species=P.cactorum
Strain=12420
mkdir -p raw_dna/paired/$Species/$Strain/F
mkdir -p raw_dna/paired/$Species/$Strain/R
cp $RawDatDir/12-420_S1_L001_R1_001.fastq.gz raw_dna/paired/$Species/$Strain/F/"$Strain"_"$Date"_F.fastq.gz
cp $RawDatDir/12-420_S1_L001_R2_001.fastq.gz raw_dna/paired/$Species/$Strain/R/"$Strain"_"$Date"_R.fastq.gz
# For missing data run for isolates 4032, 4040 and PC13/15
# RawDatDir=/projects/oldhome/miseq_data/2016/RAW/160311_M04465_0006_000000000-AKT5J/Data/Intensities/BaseCalls
# Date=160311
RawDat=/projects/oldhome/groups/harrisonlab/raw_data/raw_seq/raw_reads/160404_M004465_0008-ALVUT
mkdir -p raw_dna/paired/P.cactorum/2003_3/F
mkdir -p raw_dna/paired/P.cactorum/2003_3/R
mkdir -p raw_dna/paired/P.idaei/SCRP370/F
mkdir -p raw_dna/paired/P.idaei/SCRP370/R
cp $RawDat/Pc20033_S2_L001_R1_001.fastq.gz raw_dna/paired/P.cactorum/2003_3/F/.
cp $RawDat/Pc20033_S2_L001_R2_001.fastq.gz raw_dna/paired/P.cactorum/2003_3/R/.
cp $RawDat/SCRP370_S3_L001_R1_001.fastq.gz raw_dna/paired/P.idaei/SCRP370/F/.
cp $RawDat/SCRP370_S3_L001_R2_001.fastq.gz raw_dna/paired/P.idaei/SCRP370/R/.
RawDat=/projects/oldhome/groups/harrisonlab/raw_data/raw_seq/raw_reads/160412_M04465_0010-AMLCU
mkdir -p raw_dna/paired/P.rubi/SCRP249/F
mkdir -p raw_dna/paired/P.rubi/SCRP249/R
mkdir -p raw_dna/paired/P.rubi/SCRP324/F
mkdir -p raw_dna/paired/P.rubi/SCRP324/R
mkdir -p raw_dna/paired/P.rubi/SCRP333/F
mkdir -p raw_dna/paired/P.rubi/SCRP333/R
cp $RawDat/SCRP249_S1_L001_R1_001.fastq.gz raw_dna/paired/P.rubi/SCRP249/F/.
cp $RawDat/SCRP249_S1_L001_R2_001.fastq.gz raw_dna/paired/P.rubi/SCRP249/R/.
cp $RawDat/SCRP324_S2_L001_R1_001.fastq.gz raw_dna/paired/P.rubi/SCRP324/F/.
cp $RawDat/SCRP324_S2_L001_R2_001.fastq.gz raw_dna/paired/P.rubi/SCRP324/R/.
cp $RawDat/SCRP333_S3_L001_R1_001.fastq.gz raw_dna/paired/P.rubi/SCRP333/F/.
cp $RawDat/SCRP333_S3_L001_R2_001.fastq.gz raw_dna/paired/P.rubi/SCRP333/R/.
RawDat=/projects/oldhome/groups/harrisonlab/raw_data/raw_seq/raw_reads/160415_M004465_00011-AMLCL
mkdir -p raw_dna/paired/P.cactorum/2003_3/F
mkdir -p raw_dna/paired/P.cactorum/2003_3/R
mkdir -p raw_dna/paired/P.cactorum/PC13_15/F
mkdir -p raw_dna/paired/P.cactorum/PC13_15/R
mkdir -p raw_dna/paired/P.idaei/SCRP376/F
mkdir -p raw_dna/paired/P.idaei/SCRP376/R
cp $RawDat/20033_S4_L001_R1_001.fastq.gz raw_dna/paired/P.cactorum/2003_3/F/.
cp $RawDat/20033_S4_L001_R2_001.fastq.gz raw_dna/paired/P.cactorum/2003_3/R/.
cp $RawDat/PC1315_S5_L001_R1_001.fastq.gz raw_dna/paired/P.cactorum/PC13_15/F/.
cp $RawDat/PC1315_S5_L001_R2_001.fastq.gz raw_dna/paired/P.cactorum/PC13_15/R/.
cp $RawDat/SCRP376_S1_L001_R1_001.fastq.gz raw_dna/paired/P.idaei/SCRP376/F/.
cp $RawDat/SCRP376_S1_L001_R2_001.fastq.gz raw_dna/paired/P.idaei/SCRP376/R/.
RawDat=/projects/oldhome/miseq_data/.tmp_nas_data/miseq_data/miseq_data/RAW/2016/160223_M04465_0003_000000000-AJ4TR/Data/Intensities/BaseCalls
mkdir -p raw_dna/paired/P.cactorum/415/F
mkdir -p raw_dna/paired/P.cactorum/415/R
mkdir -p raw_dna/paired/P.cactorum/416/F
mkdir -p raw_dna/paired/P.cactorum/416/R
mkdir -p raw_dna/paired/P.cactorum/R36_14/F
mkdir -p raw_dna/paired/P.cactorum/R36_14/R
cp $RawDat/P415_S1_L001_R1_001.fastq.gz raw_dna/paired/P.cactorum/415/F/.
cp $RawDat/P415_S1_L001_R2_001.fastq.gz raw_dna/paired/P.cactorum/415/R/.
cp $RawDat/P416_S2_L001_R1_001.fastq.gz raw_dna/paired/P.cactorum/416/F/.
cp $RawDat/P416_S2_L001_R2_001.fastq.gz raw_dna/paired/P.cactorum/416/R/.
cp $RawDat/R3614_S3_L001_R1_001.fastq.gz raw_dna/paired/P.cactorum/R36_14/F/.
cp $RawDat/R3614_S3_L001_R2_001.fastq.gz raw_dna/paired/P.cactorum/R36_14/R/.
RawDat=/projects/oldhome/miseq_data/.tmp_nas_data/miseq_data/miseq_data/RAW/2016/160311_M04465_0006_000000000-AKT5J/Data/Intensities/BaseCalls
mkdir -p raw_dna/paired/P.cactorum/4032/F
mkdir -p raw_dna/paired/P.cactorum/4032/R
mkdir -p raw_dna/paired/P.cactorum/4040/F
mkdir -p raw_dna/paired/P.cactorum/4040/R
mkdir -p raw_dna/paired/P.cactorum/PC13_15/F
mkdir -p raw_dna/paired/P.cactorum/PC13_15/R
cp $RawDat/Pcact-4032_S1_L001_R1_001.fastq.gz raw_dna/paired/P.cactorum/4032/F/.
cp $RawDat/Pcact-4032_S1_L001_R2_001.fastq.gz raw_dna/paired/P.cactorum/4032/R/.
cp $RawDat/Pcact-4040_S2_L001_R1_001.fastq.gz raw_dna/paired/P.cactorum/4040/F/.
cp $RawDat/Pcact-4040_S2_L001_R2_001.fastq.gz raw_dna/paired/P.cactorum/4040/R/.
cp $RawDat/Pcact-PC1315_S3_L001_R1_001.fastq.gz raw_dna/paired/P.cactorum/PC13_15/F/.
cp $RawDat/Pcact-PC1315_S3_L001_R2_001.fastq.gz raw_dna/paired/P.cactorum/PC13_15/R/.
RawDat=/data/seq_data/miseq/2017/RAW/171006_M04465_0050_000000000-B49T7/Data/Intensities/BaseCalls
mkdir -p raw_dna/paired/P.cactorum/11-40/F
mkdir -p raw_dna/paired/P.cactorum/11-40/R
mkdir -p raw_dna/paired/P.cactorum/17-21/F
mkdir -p raw_dna/paired/P.cactorum/17-21/R
cp $RawDat/Pcact-CN11-40_S3_L001_R1_001.fastq.gz raw_dna/paired/P.cactorum/11-40/F/.
cp $RawDat/Pcact-CN11-40_S3_L001_R2_001.fastq.gz raw_dna/paired/P.cactorum/11-40/R/.
cp $RawDat/Pcact-CN17-21_S1_L001_R1_001.fastq.gz raw_dna/paired/P.cactorum/17-21/F/.
cp $RawDat/Pcact-CN17-21_S1_L001_R2_001.fastq.gz raw_dna/paired/P.cactorum/17-21/R/.
ProjectDir=/projects/oldhome/groups/harrisonlab/project_files/idris
RawDat=/data/seq_data/miseq/2018/RAW/180213_M04465_0067_000000000-BJ4DR/Data/Intensities/BaseCalls
OutDir=raw_dna/paired/P.cactorum/P421/F
mkdir -p $OutDir
cd $OutDir
cp -s $RawDat/P421_S3_L001_R1_001.fastq.gz .
cd $ProjectDir
OutDir=raw_dna/paired/P.cactorum/P421/R
mkdir -p $OutDir
cd $OutDir
cp $RawDat/P421_S3_L001_R2_001.fastq.gz .
cd $ProjectDir
#Data qc
programs: fastqc fastq-mcf kmc
Data quality was visualised using fastqc:
# for RawData in $(ls raw_dna/paired/P.*/*/*/*.fastq.gz | grep '170210'); do
for RawData in $(ls raw_dna/paired/P.cactorum/*/*/*.fastq.gz | grep -v '10300' | grep -e 'P421'); do
echo $RawData;
ProgDir=/projects/oldhome/armita/git_repos/emr_repos/tools/seq_tools/dna_qc;
qsub $ProgDir/run_fastqc.sh $RawData;
done
Trimming was performed on data to trim adapters from sequences and remove poor quality data. This was done with fastq-mcf
Firstly, those strains with more than one run were identified:
for Strain in $(ls -d raw_dna/paired/P.*/* | grep -e 'P.cactorum' -e 'P.idaei'); do
NumReads=$(ls $Strain/F/*.gz | wc -l);
if [ $NumReads -gt 1 ]; then
echo "$Strain";
echo "$NumReads";
fi;
done
for StrainPath in $(ls -d raw_dna/paired/P.*/* | grep -v -w -e '10300' -e '404' -e '414' -e '415' -e '416' -e 'PC13_15' -e '2003_3' | grep -e 'P.cactorum' -e 'P.idaei' | grep -e 'P421'); do
# for StrainPath in $(ls -d raw_dna/paired/*/* | grep -e 'idaei'); do
Jobs=$(qstat | grep 'rna_qc_' | grep 'qw' | wc -l)
while [ $Jobs -gt 1 ]; do
sleep 1m
printf "."
Jobs=$(qstat | grep 'rna_qc_' | grep 'qw' | wc -l)
done
echo $StrainPath
Read_F=$(ls $StrainPath/F/*.fastq.gz)
Read_R=$(ls $StrainPath/R/*.fastq.gz)
IluminaAdapters=/projects/oldhome/armita/git_repos/emr_repos/tools/seq_tools/ncbi_adapters.fa
ProgDir=/projects/oldhome/armita/git_repos/emr_repos/tools/seq_tools/rna_qc
qsub $ProgDir/rna_qc_fastq-mcf.sh $Read_F $Read_R $IluminaAdapters DNA
done
Trimming was then performed for strains with two runs of data
for StrainPath in $(ls -d raw_dna/paired/P.cactorum/* | grep -w -e '415' -e '416' -e 'PC13_15' -e '2003_3'
| grep '2003_3'); do
Jobs=$(qstat | grep 'rna_qc_' | grep 'qw' | wc -l)
while [ $Jobs -gt 1 ]; do
sleep 1m
printf "."
Jobs=$(qstat | grep 'rna_qc_' | grep 'qw' | wc -l)
done
echo $StrainPath
Read_F=$(ls $StrainPath/F/*.fastq.gz | head -n1 | tail -n1)
Read_R=$(ls $StrainPath/R/*.fastq.gz | head -n1 | tail -n1)
IluminaAdapters=/projects/oldhome/armita/git_repos/emr_repos/tools/seq_tools/ncbi_adapters.fa
ProgDir=/projects/oldhome/armita/git_repos/emr_repos/tools/seq_tools/rna_qc
qsub $ProgDir/rna_qc_fastq-mcf.sh $Read_F $Read_R $IluminaAdapters DNA
Read_F=$(ls $StrainPath/F/*.fastq.gz | head -n2 | tail -n1)
Read_R=$(ls $StrainPath/R/*.fastq.gz | head -n2 | tail -n1)
IluminaAdapters=/projects/oldhome/armita/git_repos/emr_repos/tools/seq_tools/ncbi_adapters.fa
ProgDir=/projects/oldhome/armita/git_repos/emr_repos/tools/seq_tools/rna_qc
qsub $ProgDir/rna_qc_fastq-mcf.sh $Read_F $Read_R $IluminaAdapters DNA
done
for three sets of data:
for StrainPath in $(ls -d raw_dna/paired/P.cactorum/* | grep -w -e '404' -e '414'); do
Jobs=$(qstat | grep 'rna_qc_' | grep 'qw' | wc -l)
while [ $Jobs -gt 1 ]; do
sleep 1m
printf "."
Jobs=$(qstat | grep 'rna_qc_' | grep 'qw' | wc -l)
done
echo $StrainPath
Read_F=$(ls $StrainPath/F/*.fastq.gz | head -n1 | tail -n1)
Read_R=$(ls $StrainPath/R/*.fastq.gz | head -n1 | tail -n1)
IluminaAdapters=/projects/oldhome/armita/git_repos/emr_repos/tools/seq_tools/ncbi_adapters.fa
ProgDir=/projects/oldhome/armita/git_repos/emr_repos/tools/seq_tools/rna_qc
qsub $ProgDir/rna_qc_fastq-mcf.sh $Read_F $Read_R $IluminaAdapters DNA
Read_F=$(ls $StrainPath/F/*.fastq.gz | head -n2 | tail -n1)
Read_R=$(ls $StrainPath/R/*.fastq.gz | head -n2 | tail -n1)
IluminaAdapters=/projects/oldhome/armita/git_repos/emr_repos/tools/seq_tools/ncbi_adapters.fa
ProgDir=/projects/oldhome/armita/git_repos/emr_repos/tools/seq_tools/rna_qc
qsub $ProgDir/rna_qc_fastq-mcf.sh $Read_F $Read_R $IluminaAdapters DNA
Read_F=$(ls $StrainPath/F/*.fastq.gz | head -n3 | tail -n1)
Read_R=$(ls $StrainPath/R/*.fastq.gz | head -n3 | tail -n1)
IluminaAdapters=/projects/oldhome/armita/git_repos/emr_repos/tools/seq_tools/ncbi_adapters.fa
ProgDir=/projects/oldhome/armita/git_repos/emr_repos/tools/seq_tools/rna_qc
qsub $ProgDir/rna_qc_fastq-mcf.sh $Read_F $Read_R $IluminaAdapters DNA
done
Data quality was visualised once again following trimming:
for TrimData in $(ls qc_dna/paired/P.cactorum/*/*/*.fq.gz | grep -e 'P421'); do
echo $TrimData;
ProgDir=/projects/oldhome/armita/git_repos/emr_repos/tools/seq_tools/dna_qc;
qsub $ProgDir/run_fastqc.sh $TrimData;
done
Sequencing coveraqge was estimated:
for RawData in $(ls qc_dna/paired/P.*/*/*/*q.gz | grep -e 'P421'); do
echo $RawData;
ProgDir=/projects/oldhome/armita/git_repos/emr_repos/tools/seq_tools/dna_qc;
qsub $ProgDir/run_fastqc.sh $RawData;
# GenomeSz=65
GenomeSz=60
OutDir=$(dirname $RawData)
qsub $ProgDir/sub_count_nuc.sh $GenomeSz $RawData $OutDir
done
Find predicted coverage for these isolates:
for StrainDir in $(ls -d qc_dna/paired/P.*/* | grep -v -w -e '411' -e '10300_old' | grep -e 'cactorum' -e 'idaei' | grep -e '11-40' -e '17-21' -e 'P421'); do
Strain=$(basename $StrainDir)
printf "$Strain\t"
for File in $(ls qc_dna/paired/P.*/"$Strain"/*/*.txt); do
echo $(basename $File);
cat $File | tail -n1 | rev | cut -f2 -d ' ' | rev;
done | grep -v '.txt' | awk '{ SUM += $1} END { print SUM }'
done
10300 206.15
12420 25.6
15_13 55.63
15_7 59.71
2003_3 69.29
4032 53.14
404 69.52
4040 67.21
414 76.49
415 86.64
416 106.94
62471 72.21
P295 39.29
PC13_15 77.83
R36_14 56.09
371 71.89
SCRP370 40.91
SCRP376 49.19
11-40 68.9
17-21 78.86
P421 48.22
#Assembly
Assembly was performed with:
- Spades
Assembly was submitted for genomes with a single run of data
for StrainPath in $(ls -d qc_dna/paired/P.*/* | grep -v -w -e '10300' -e '404' -e '414' -e '415' -e '416' -e 'PC13_15' -e '2003_3'| grep -e 'P.cactorum' -e 'P.idaei' | grep -e '11-40' -e '17-21' -e 'P421'); do
ProgDir=/projects/oldhome/armita/git_repos/emr_repos/tools/seq_tools/assemblers/spades
Strain=$(echo $StrainPath | rev | cut -f1 -d '/' | rev)
Organism=$(echo $StrainPath | rev | cut -f2 -d '/' | rev)
F_Read=$(ls $StrainPath/F/*_trim.fq.gz)
R_Read=$(ls $StrainPath/R/*_trim.fq.gz)
OutDir=assembly/spades/$Organism/$Strain
Jobs=$(qstat | grep -e 'subSpades' -e 'submit_SPA' | grep 'qw' | wc -l)
while [ $Jobs -gt 1 ]; do
sleep 5m
printf "."
Jobs=$(qstat | grep -e 'subSpades' -e 'submit_SPA' | grep 'qw' | wc -l)
done
printf "\n"
echo $F_Read
echo $R_Read
qsub $ProgDir/submit_SPAdes_HiMem.sh $F_Read $R_Read $OutDir correct
done
For isolates with two runs of data:
for StrainPath in $(ls -d qc_dna/paired/P.cactorum/* | grep -w -e '415' -e '416' -e 'PC13_15' -e '2003_3' | grep -e '2003_3'); do
echo $StrainPath
ProgDir=/projects/oldhome/armita/git_repos/emr_repos/tools/seq_tools/assemblers/spades/multiple_libraries
Strain=$(echo $StrainPath | rev | cut -f1 -d '/' | rev)
Organism=$(echo $StrainPath | rev | cut -f2 -d '/' | rev)
echo $Strain
echo $Organism
TrimF1_Read=$(ls $StrainPath/F/*_trim.fq.gz | head -n1 | tail -n1);
TrimR1_Read=$(ls $StrainPath/R/*_trim.fq.gz | head -n1 | tail -n1);
TrimF2_Read=$(ls $StrainPath/F/*_trim.fq.gz | head -n2 | tail -n1);
TrimR2_Read=$(ls $StrainPath/R/*_trim.fq.gz | head -n2 | tail -n1);
echo $TrimF1_Read
echo $TrimR1_Read
echo $TrimF2_Read
echo $TrimR2_Read
OutDir=assembly/spades/$Organism/$Strain
Jobs=$(qstat | grep -e 'subSpades' -e 'submit_SPA' | grep 'qw' | wc -l)
while [ $Jobs -gt 1 ]; do
sleep 5m
printf "."
Jobs=$(qstat | grep -e 'subSpades' -e 'submit_SPA' | grep 'qw' | wc -l)
done
printf "\n"
qsub $ProgDir/subSpades_2lib_HiMem.sh $TrimF1_Read $TrimR1_Read $TrimF2_Read $TrimR2_Read $OutDir correct
done
for isolates with three runs of data:
for StrainPath in $(ls -d qc_dna/paired/P.cactorum/* | grep -w -e '404' -e '414'); do
echo $StrainPath
ProgDir=/projects/oldhome/armita/git_repos/emr_repos/tools/seq_tools/assemblers/spades/multiple_libraries
Strain=$(echo $StrainPath | rev | cut -f1 -d '/' | rev)
Organism=$(echo $StrainPath | rev | cut -f2 -d '/' | rev)
echo $Strain
echo $Organism
TrimF1_Read=$(ls $StrainPath/F/*_trim.fq.gz | head -n1 | tail -n1);
TrimR1_Read=$(ls $StrainPath/R/*_trim.fq.gz | head -n1 | tail -n1);
TrimF2_Read=$(ls $StrainPath/F/*_trim.fq.gz | head -n2 | tail -n1);
TrimR2_Read=$(ls $StrainPath/R/*_trim.fq.gz | head -n2 | tail -n1);
TrimF3_Read=$(ls $StrainPath/F/*_trim.fq.gz | head -n3 | tail -n1);
TrimR3_Read=$(ls $StrainPath/R/*_trim.fq.gz | head -n3 | tail -n1);
echo $TrimF1_Read
echo $TrimR1_Read
echo $TrimF2_Read
echo $TrimR2_Read
echo $TrimF3_Read
echo $TrimR3_Read
OutDir=assembly/spades/$Organism/$Strain
Jobs=$(qstat | grep -e 'subSpades' -e 'submit_SPA' | grep 'qw' | wc -l)
while [ $Jobs -gt 1 ]; do
sleep 5m
printf "."
Jobs=$(qstat | grep -e 'subSpades' -e 'submit_SPA' | grep 'qw' | wc -l)
done
printf "\n"
qsub $ProgDir/subSpades_3lib_HiMem.sh $TrimF1_Read $TrimR1_Read $TrimF2_Read $TrimR2_Read $TrimF3_Read $TrimR3_Read $OutDir correct
done
Assembly results were summarised using Quast:
for Assembly in $(ls assembly/spades/*/*/filtered_contigs*/contigs_min_500bp.fasta | grep -e 'P.cactorum' -e 'P.idaei' | grep -e '11-40' -e '17-21' -e 'P421'); do
Strain=$(echo $Assembly | rev | cut -f3 -d '/' | rev);
Organism=$(echo $Assembly | rev | cut -f4 -d '/' | rev);
OutDir=$(dirname $Assembly)
ProgDir=/projects/oldhome/armita/git_repos/emr_repos/tools/seq_tools/assemblers/assembly_qc/quast
qsub $ProgDir/sub_quast.sh $Assembly $OutDir;
done
Contigs were identified that had blast hits to non-phytophthora genomes
for Assembly in $(ls assembly/spades/*/*/filtered_contigs/contigs_min_500bp.fasta | grep -e 'P.cactorum' -e 'P.idaei' | grep -e '11-40' -e '17-21' -e 'P421' | grep 'P421_v2'); do
Strain=$(echo $Assembly | rev | cut -f3 -d '/' | rev)
Organism=$(echo $Assembly | rev | cut -f4 -d '/' | rev)
echo "$Organism - $Strain"
for Exclude_db in "bacillus" "delftia" "paenibacillus"; do
Good_db="phytoph"
AssemblyDir=$(dirname $Assembly)
OutDir=$AssemblyDir/../deconseq_$Exclude_db
ProgDir=/projects/oldhome/armita/git_repos/emr_repos/tools/seq_tools/assemblers/assembly_qc/remove_contaminants
qsub $ProgDir/sub_deconseq.sh $Assembly $Exclude_db $Good_db $OutDir
done
done
Results were summarised using the commands:
# for File in $(ls assembly/spades/P.*/*/deconseq/log.txt); do
# for File in $(ls assembly/spades/P.*/*/deconseq_common/log.txt); do
for Exclude_db in "bacillus" "delftia" "paenibacillus"; do
echo $Exclude_db
for File in $(ls assembly/spades/P.*/*/*/log.txt | grep "$Exclude_db" | grep -e '11-40' -e '17-21' -e 'P421'); do
Name=$(echo $File | rev | cut -f3 -d '/' | rev);
Good=$(cat $File |cut -f2 | head -n1 | tail -n1);
Both=$(cat $File |cut -f2 | head -n2 | tail -n1);
Bad=$(cat $File |cut -f2 | head -n3 | tail -n1);
printf "$Name\t$Good\t$Both\t$Bad\n";
done
done
Results from Paen
12420 5508 3 30
15_13 5419 2 1
15_7 5457 1 7
2003_3 5473 5 111
4032 6705 28 1669
404 20143 2 49
4040 5324 3 615
414 5107 2 924
415 5707 2 7
416 5501 2 7
62471 6156 24 946
P295 5088 2 99
PC13_15 5275 3 39
R36_14 5957 3 38
371 4709 1 1
SCRP370 5355 2 1
SCRP376 5313 1 1
11-40 4573 1 1
17-21 4451 2 1
P421 8078 2 3
Results from delftia
12420 5534 1 6
15_13 5420 1 1
15_7 5462 0 3
2003_3 5583 0 6
4032 8396 2 4
4040 5938 2 2
404 20184 0 10
414 6029 0 4
415 5713 0 3
416 5507 0 3
62471 7123 1 2
P295 5179 1 9
PC13_15 5309 0 8
R36_14 5990 0 8
371 4709 1 1
SCRP370 5355 2 1
SCRP376 5313 1 1
11-40 4573 1 1
17-21 4452 1 1
P421 8065 2 16
Results from bacillus
12420 5529 3 9
15_13 5419 2 1
15_7 5461 1 3
2003_3 5577 1 11
4032 8330 2 70
4040 5890 5 47
404 20175 2 17
414 5995 1 37
415 5693 3 20
416 5486 2 22
62471 7056 5 65
P295 5162 3 24
PC13_15 5302 1 14
R36_14 5975 1 22
371 4710 0 1
SCRP370 5356 1 1
SCRP376 5314 0 1
bacillus
11-40 4573 1 1
17-21 4450 3 1
17-21 4451 2 1
P421 8077 3 3
P421 8078 2 3
for Assembly in $(ls assembly/spades/*/*/filtered_contigs/contigs_min_500bp.fasta | grep -e 'P.cactorum' -e 'P.idaei' | grep -e '11-40' -e '17-21' -e 'P421' | grep 'P421_v2'); do
Strain=$(echo $Assembly | rev | cut -f3 -d '/' | rev)
Organism=$(echo $Assembly | rev | cut -f4 -d '/' | rev)
echo "$Organism - $Strain"
for Exclude_db in "stenotrophomonas"; do
Good_db="phytoph"
AssemblyDir=$(dirname $Assembly)
OutDir=$AssemblyDir/../deconseq_$Exclude_db
ProgDir=/projects/oldhome/armita/git_repos/emr_repos/tools/seq_tools/assemblers/assembly_qc/remove_contaminants
qsub $ProgDir/sub_deconseq.sh $Assembly $Exclude_db $Good_db $OutDir
done
done
Those contigs that were not identified as contaminants in each of the filtering steps were retained.
for Assembly in $(ls assembly/spades/*/*/filtered_contigs/contigs_min_500bp.fasta | grep -e 'P.cactorum' -e 'P.idaei'| grep -e '11-40' -e '17-21' -e 'P421' | grep -v -w 'P421'); do
Strain=$(echo $Assembly | rev | cut -f3 -d '/' | rev)
Organism=$(echo $Assembly | rev | cut -f4 -d '/' | rev)
echo "$Organism - $Strain"
StrainDir=$(ls -d assembly/spades/$Organism/$Strain)
mkdir -p $StrainDir/deconseq_appended
for File in $(ls $StrainDir/deconseq_*/*cont.fa | grep -e "bacillus" -e "delftia" -e "Paen"); do
cat $File | grep '>'
done | sort | uniq | tr -d '>' > $StrainDir/deconseq_appended/exclude_list.txt
Instructions=$StrainDir/deconseq_appended/exclude_instructions.txt
printf "Exclude:\nSequence name, length, apparent source\n" > $Instructions
cat $StrainDir/deconseq_appended/exclude_list.txt | sed -r 's/$/\t.\tcontaminant/g' >> $Instructions
ProgDir=~/git_repos/emr_repos/tools/seq_tools/assemblers/assembly_qc/remove_contaminants
$ProgDir/remove_contaminants.py --keep_mitochondria --inp $Assembly --out $StrainDir/deconseq_appended/contigs_min_500bp_renamed.fasta --coord_file $Instructions
done
Strain P421 was searched against an additional database.
for Assembly in $(ls assembly/spades/*/*/filtered_contigs/contigs_min_500bp.fasta | grep -e 'P.cactorum' -e 'P.idaei' | grep 'P421'); do
Strain=$(echo $Assembly | rev | cut -f3 -d '/' | rev)
Organism=$(echo $Assembly | rev | cut -f4 -d '/' | rev)
echo "$Organism - $Strain"
StrainDir=$(ls -d assembly/spades/$Organism/$Strain)
mkdir -p $StrainDir/deconseq_appended
for File in $(ls $StrainDir/deconseq_*/*cont.fa | grep -e "bacillus" -e "delftia" -e "Paen" -e "stenotrophomonas"); do
cat $File | grep '>'
done | sort | uniq | tr -d '>' > $StrainDir/deconseq_appended/exclude_list.txt
Instructions=$StrainDir/deconseq_appended/exclude_instructions.txt
printf "Exclude:\nSequence name, length, apparent source\n" > $Instructions
cat $StrainDir/deconseq_appended/exclude_list.txt | sed -r 's/$/\t.\tcontaminant/g' >> $Instructions
ProgDir=~/git_repos/emr_repos/tools/seq_tools/assemblers/assembly_qc/remove_contaminants
$ProgDir/remove_contaminants.py --keep_mitochondria --inp $Assembly --out $StrainDir/deconseq_appended/contigs_min_500bp_renamed.fasta --coord_file $Instructions
done
The number of reads aligning from P421 to potential contaminants were identified.
Fusarium=$(ls ../fusarium/assembly/external_group/F.oxysporum_fsp_lycopersici/4287_chromosomal/ensembl/Fusarium_oxysporum_chromosome_and_additional_contigs.fa)
Stenotrophomonas=$(ls ../../../../../projects/oldhome/armita/prog/deconseq-standalone-0.4.3/database/stenotrophomonas/all_complete_GbStenotrophomonas.fasta)
for Reference in $(ls $Fusarium $Stenotrophomonas); do
for StrainPath in $(ls -d qc_dna/paired/*/* | grep '421'); do
ProgDir=/projects/oldhome/armita/git_repos/emr_repos/tools/seq_tools/assemblers/spades
Strain=$(echo $StrainPath | rev | cut -f1 -d '/' | rev)
Organism=$(echo $StrainPath | rev | cut -f2 -d '/' | rev)
F_Read=$(ls $StrainPath/F/*_trim.fq.gz)
R_Read=$(ls $StrainPath/R/*_trim.fq.gz)
echo $F_Read
echo $R_Read
Prefix=$(basename $Reference | sed 's/.fasta//g' | sed 's/.fa//g' | sed 's/.fna//g')
OutDir=analysis/genome_alignment/bowtie/$Organism/$Strain/vs_${Prefix}
ProgDir=/projects/oldhome/armita/git_repos/emr_repos/tools/seq_tools/genome_alignment
qsub $ProgDir/bowtie/sub_bowtie_unaligned.sh $Reference $F_Read $R_Read $OutDir
done
done
P241 was reassembled:
for StrainPath in $(ls -d analysis/genome_alignment/bowtie/P.*/* | grep 'P421'); do
ProgDir=/projects/oldhome/armita/git_repos/emr_repos/tools/seq_tools/assemblers/spades
Strain=$(echo $StrainPath | rev | cut -f1 -d '/' | rev)
Organism=$(echo $StrainPath | rev | cut -f2 -d '/' | rev)
F_Read=$(ls $StrainPath/vs_all_complete_GbStenotrophomonas/*.fq.1.gz)
R_Read=$(ls $StrainPath/vs_all_complete_GbStenotrophomonas/*.fq.2.gz)
OutDir=assembly/spades/$Organism/${Strain}_v2
echo $F_Read
echo $R_Read
qsub $ProgDir/submit_SPAdes_HiMem.sh $F_Read $R_Read $OutDir correct
done
Assembly results were summarised using Quast:
for Assembly in $(ls assembly/spades/*/*/filtered_contigs*/contigs_min_500bp.fasta | grep -e 'P.cactorum' -e 'P.idaei' | grep 'P421_v2'); do
Strain=$(echo $Assembly | rev | cut -f3 -d '/' | rev);
Organism=$(echo $Assembly | rev | cut -f4 -d '/' | rev);
OutDir=$(dirname $Assembly)
ProgDir=/projects/oldhome/armita/git_repos/emr_repos/tools/seq_tools/assemblers/assembly_qc/quast
qsub $ProgDir/sub_quast.sh $Assembly $OutDir;
done
for Assembly in $(ls assembly/spades/*/*/filtered_contigs/contigs_min_500bp.fasta | grep -e 'P.cactorum' -e 'P.idaei' | grep 'P421_v2'); do
Strain=$(echo $Assembly | rev | cut -f3 -d '/' | rev)
Organism=$(echo $Assembly | rev | cut -f4 -d '/' | rev)
echo "$Organism - $Strain"
# for Exclude_db in "bacillus" "delftia" "paenibacillus" "stenotrophomonas" "FoL"; do
for Exclude_db in "FoL"; do
Good_db="phytoph"
AssemblyDir=$(dirname $Assembly)
OutDir=$AssemblyDir/../deconseq_$Exclude_db
ProgDir=/projects/oldhome/armita/git_repos/emr_repos/tools/seq_tools/assemblers/assembly_qc/remove_contaminants
qsub $ProgDir/sub_deconseq.sh $Assembly $Exclude_db $Good_db $OutDir
done
done
Results were summarised using the commands:
for Exclude_db in "bacillus" "delftia" "paenibacillus" "stenotrophomonas" "FoL"; do
echo $Exclude_db
for File in $(ls assembly/spades/P.*/*/*/log.txt | grep "_${Exclude_db}" | grep 'P421_v2'); do
Name=$(echo $File | rev | cut -f3 -d '/' | rev);
Good=$(cat $File |cut -f2 | head -n1 | tail -n1);
Both=$(cat $File |cut -f2 | head -n2 | tail -n1);
Bad=$(cat $File |cut -f2 | head -n3 | tail -n1);
printf "$Name\t$Good\t$Both\t$Bad\n";
done
done
bacillus
P421_v2 7706 2 8
delftia
P421_v2 7536 2 178
paenibacillus
P421_v2 7699 2 15
stenotrophomonas
P421_v2 7058 5 653
FoL
P421_v2 7189 45 482
Those contigs that were not identified as contaminants in each of the filtering steps were retained.
for Assembly in $(ls assembly/spades/*/*/filtered_contigs/contigs_min_500bp.fasta | grep -e 'P.cactorum' -e 'P.idaei' | grep -w 'P421_v2'); do
Strain=$(echo $Assembly | rev | cut -f3 -d '/' | rev)
Organism=$(echo $Assembly | rev | cut -f4 -d '/' | rev)
echo "$Organism - $Strain"
StrainDir=$(ls -d assembly/spades/$Organism/$Strain)
mkdir -p $StrainDir/deconseq_appended
for File in $(ls $StrainDir/deconseq_*/*cont.fa | grep -e "bacillus" -e "delftia" -e "Paen" -e "stenotrophomonas" -e "FoL"); do
cat $File | grep '>'
done | sort | uniq | tr -d '>' > $StrainDir/deconseq_appended/exclude_list.txt
Instructions=$StrainDir/deconseq_appended/exclude_instructions.txt
printf "Exclude:\nSequence name, length, apparent source\n" > $Instructions
cat $StrainDir/deconseq_appended/exclude_list.txt | sed -r 's/$/\t.\tcontaminant/g' >> $Instructions
ProgDir=~/git_repos/emr_repos/tools/seq_tools/assemblers/assembly_qc/remove_contaminants
$ProgDir/remove_contaminants.py --keep_mitochondria --inp $Assembly --out $StrainDir/deconseq_appended/contigs_min_500bp_renamed.fasta --coord_file $Instructions
done
Assembly stats were collected on filtered assemblies:
for Assembly in $(ls assembly/spades/P.*/*/deconseq_appended/contigs_min_500bp_renamed.fasta | grep -e 'P.cactorum' -e 'P.idaei'| grep 'P421_v2'); do
Strain=$(echo $Assembly | rev | cut -f3 -d '/' | rev)
Organism=$(echo $Assembly | rev | cut -f4 -d '/' | rev)
OutDir=$(dirname $Assembly)
ProgDir=/projects/oldhome/armita/git_repos/emr_repos/tools/seq_tools/assemblers/assembly_qc/quast
qsub $ProgDir/sub_quast.sh $Assembly $OutDir
done
Assembly stats were summarised and compared to previous assembly results using:
# for Assembly in $(ls assembly/spades/P.*/*/deconseq_Paen/report.tsv); do
for Assembly in $(ls assembly/spades/P.*/*/deconseq_appended/report.tsv | grep -w '404'); do
Strain=$(echo $Assembly | rev | cut -f3 -d '/'| rev);
Size=$(cat $Assembly | grep 'Total length' | head -n1 | cut -f2);
OldAssembly=$(ls assembly/spades/P.*/$Strain/filtered_contigs*/report.tsv)
OldSize=$(cat $OldAssembly | grep 'Total length' | head -n1 | cut -f2);
printf "$Strain\t$Size\t$OldSize\n";
done
11-40 59726247 59731760
12420 60315503 66488881
15_13 60703463 60708976
15_7 60797060 60806527
17-21 59761502 59767015
2003_3 60800281 67797455
4032 61561706 70105079
4040 60657148 65177765
404 75508939 82472226
414 61116855 62710907
415 60677782 64884339
416 60502157 64802670
62471 61265225 62823426
P295 60707114 67293243
P421 64527586 65473778
P421_v2 60335997 64856938
PC13_15 61130682 67857120
R36_14 61656467 68350251
371 60474213 60483665
SCRP370 60582725 60561664
SCRP376 60610468 60615981
Numbers of busco genes in each assembly were identified:
for Assembly in $(ls assembly/spades/P.*/*/deconseq_Paen/contigs_min_500bp_filtered_renamed.fasta | grep -e 'P.cactorum' -e 'P.idaei'); do
# for Assembly in $(ls assembly/spades/P.*/*/deconseq_appended/contigs_min_500bp_renamed.fasta | grep -e 'P.cactorum' -e 'P.idaei'| grep 'P421_v2'); do
Strain=$(echo $Assembly| rev | cut -d '/' -f3 | rev)
Organism=$(echo $Assembly | rev | cut -d '/' -f4 | rev)
echo "$Organism - $Strain"
ProgDir=/projects/oldhome/armita/git_repos/emr_repos/tools/gene_prediction/busco
BuscoDB="Eukaryotic"
# BuscoDB="stramenopiles"
OutDir=gene_pred/busco/$Organism/$Strain/assembly
# OutDir=$(dirname $Assembly)"_stramenopiles"
qsub $ProgDir/sub_busco3.sh $Assembly $BuscoDB $OutDir
# qsub $ProgDir/sub_busco_v4.sh $Assembly $BuscoDB $OutDir
done
A Bioproject and Biosample was made with NCBI genbank for submission of genomes. Following the creation of these submissions, the .fasta assembly was uploaded through the submission portal. A note was provided requesting that the assembly be run through the contamination screen to aid a more detailed resubmission in future. The returned FCSreport.txt was downloaded from the NCBI webportal and used to correct the assembly to NCBI standards.
NCBI reports (FCSreport.txt) were manually downloaded to the following loactions:
for Assembly in $(ls assembly/spades/P.*/*/deconseq_appended/contigs_min_500bp_renamed.fasta | grep -e 'P.cactorum' -e 'P.idaei' | grep 'P421_v2'); do
Strain=$(echo $Assembly | rev | cut -f3 -d '/' | rev)
Organism=$(echo $Assembly | rev | cut -f4 -d '/' | rev)
NCBI_report_dir=genome_submission/$Organism/$Strain/initial_submission
echo "$PWD/$NCBI_report_dir"
mkdir -p $NCBI_report_dir
done
The 12-420 genome was further edited by removing contig1
Strain='12420'
Organism='P.cactorum'
NCBI_report=$(ls genome_submission/$Organism/$Strain/initial_submission/Contamination*.txt)
sed -i 's/Trim:/Exclude:/g' $NCBI_report
These downloaded files were used to correct assemblies:
for Assembly in $(ls assembly/spades/P.*/*/deconseq_appended/contigs_min_500bp_renamed.fasta | grep -e 'P.cactorum' -e 'P.idaei' | grep -e 'P421_v2'); do
Strain=$(echo $Assembly | rev | cut -f3 -d '/' | rev)
Organism=$(echo $Assembly | rev | cut -f4 -d '/' | rev)
echo "$Organism - $Strain"
NCBI_report=$(ls genome_submission/$Organism/$Strain/initial_submission/Contamination*.txt)
if [[ $NCBI_report ]]; then
echo "Contamination report found"
else
NCBI_report=genome_submission/$Organism/$Strain/initial_submission/no_edits.txt
printf "Exclude:\nSequence name, length, apparent source\n" > $NCBI_report
fi
OutDir=assembly/spades/$Organism/$Strain/ncbi_edits
mkdir -p $OutDir
ProgDir=~/git_repos/emr_repos/tools/seq_tools/assemblers/assembly_qc/remove_contaminants
$ProgDir/remove_contaminants.py --inp $Assembly --out $OutDir/contigs_min_500bp_renamed.fasta --coord_file $NCBI_report > $OutDir/log.txt
# $ProgDir/remove_contaminants.py --keep_mitochondria --inp $Assembly --out $OutDir/contigs_min_500bp_renamed.fasta --coord_file $NCBI_report > $OutDir/log.txt
done
echo "Strain P421 has further edits to be made"
NCBI_report_dir=genome_submission/P.cactorum/P421_v2/second_submission
echo "$PWD/$NCBI_report_dir"
mkdir -p $NCBI_report_dir
# Move downloaded contamination report into this directory
for Assembly in $(ls assembly/spades/*/*/ncbi_edits/contigs_min_500bp_renamed.fasta | grep -e 'P.cactorum' -e 'P.idaei' | grep 'P421_v2'); do
Strain=$(echo $Assembly | rev | cut -f3 -d '/' | rev)
Organism=$(echo $Assembly | rev | cut -f4 -d '/' | rev)
echo "$Organism - $Strain"
Round1Name=$(echo $Assembly | sed 's/contigs_min_500bp_renamed.fasta/round1.fasta/g')
mv $Assembly $Round1Name
NCBI_report=genome_submission/$Organism/$Strain/second_submission/Contamination*.txt
OutDir=assembly/spades/$Organism/$Strain/ncbi_edits
mkdir -p $OutDir
ProgDir=~/git_repos/emr_repos/tools/seq_tools/assemblers/assembly_qc/remove_contaminants
$ProgDir/remove_contaminants.py --inp $Round1Name --out $OutDir/contigs_min_500bp_renamed.fasta --coord_file $NCBI_report > $OutDir/log.txt
done
#
# NCBI_report_dir=genome_submission/P.cactorum/P421_v2/third_submission
# echo "$PWD/$NCBI_report_dir"
# mkdir -p $NCBI_report_dir
# # Move downloaded contamination report into this directory
# for Assembly in $(ls assembly/spades/*/*/ncbi_edits/contigs_min_500bp_renamed.fasta | grep -e 'P.cactorum' -e 'P.idaei' | grep 'P421_v2'); do
# Strain=$(echo $Assembly | rev | cut -f3 -d '/' | rev)
# Organism=$(echo $Assembly | rev | cut -f4 -d '/' | rev)
# echo "$Organism - $Strain"
# Round2Name=$(echo $Assembly | sed 's/contigs_min_500bp_renamed.fasta/round2.fasta/g')
# mv $Assembly $Round2Name
# NCBI_report=genome_submission/$Organism/$Strain/third_submission/Contamination*.txt
# OutDir=assembly/spades/$Organism/$Strain/ncbi_edits
# mkdir -p $OutDir
# ProgDir=~/git_repos/emr_repos/tools/seq_tools/assemblers/assembly_qc/remove_contaminants
# $ProgDir/remove_contaminants.py --inp $Round2Name --out $OutDir/contigs_min_500bp_renamed.fasta --coord_file $NCBI_report > $OutDir/log.txt
# done
for Assembly in $(ls assembly/spades/*/*/ncbi_edits/contigs_min_500bp_renamed.fasta | grep -e 'P.cactorum' -e 'P.idaei' | grep 'P421_v2'); do
Kmer=$(echo $Assembly | rev | cut -f2 -d '/' | rev);
Strain=$(echo $Assembly | rev | cut -f3 -d '/' | rev);
Organism=$(echo $Assembly | rev | cut -f4 -d '/' | rev);
OutDir=$(dirname $Assembly)
ProgDir=/projects/oldhome/armita/git_repos/emr_repos/tools/seq_tools/assemblers/assembly_qc/quast
qsub $ProgDir/sub_quast.sh $Assembly $OutDir;
ProgDir=/projects/oldhome/armita/git_repos/emr_repos/tools/gene_prediction/busco
BuscoDB="Eukaryotic"
OutDir=gene_pred/busco/$Organism/$Strain/assembly
qsub $ProgDir/sub_busco3.sh $Assembly $BuscoDB $OutDir
done
Assembly stats were summarised using:
for File in $(ls assembly/spades/P.cactorum/*/filtered_contigs/report.tsv); do
echo "$File" | rev | cut -f3 -d '/' | rev ;
cat $File | cut -f2;
done
The following assemblies were accepted by ncbi following runs through the Paenobacillus deconseq database:
Pcac_12-420
Pcac_15-13
Pcac_4032
Pcac_4040
The following were run through the Paenobacillus, Bacillus, Delftia database:
R36-14
416
415
2003_3
Repeat masking was performed and used the following programs: Repeatmasker Repeatmodeler
The best assemblies were used to perform repeatmasking
ProgDir=/projects/oldhome/armita/git_repos/emr_repos/tools/seq_tools/repeat_masking
for BestAss in $(ls assembly/spades/P.*/*/ncbi_edits/contigs_min_500bp_renamed.fasta | grep -e 'P.idaei' -e 'P.cactorum' | grep 'P421'); do
Strain=$(echo $BestAss | rev | cut -f3 -d '/' | rev)
Organism=$(echo $BestAss | rev | cut -f4 -d '/' | rev)
OutDir=repeat_masked/$Organism/"$Strain"/filtered_contigs_repmask
qsub $ProgDir/rep_modeling.sh $BestAss $OutDir
qsub $ProgDir/transposonPSI.sh $BestAss $OutDir
done
The number of bases masked by transposonPSI and Repeatmasker were summarised using the following commands:
for RepDir in $(ls -d repeat_masked/P.*/*/filtered_contigs_repmask | grep -e 'P.cactorum' -e 'P.idaei' | grep -v -e '414' -e '10300'); do
Strain=$(echo $RepDir | rev | cut -f2 -d '/' | rev)
Organism=$(echo $RepDir | rev | cut -f3 -d '/' | rev)
RepMaskGff=$(ls $RepDir/*_contigs_hardmasked.gff)
TransPSIGff=$(ls $RepDir/*_contigs_unmasked.fa.TPSI.allHits.chains.gff3)
# printf "The number of bases masked by RepeatMasker:\t"
Repmasker=$(sortBed -i $RepMaskGff | bedtools merge | awk -F'\t' 'BEGIN{SUM=0}{ SUM+=$3-$2 }END{print SUM}')
# printf "The number of bases masked by TransposonPSI:\t"
TPSI=$(sortBed -i $TransPSIGff | bedtools merge | awk -F'\t' 'BEGIN{SUM=0}{ SUM+=$3-$2 }END{print SUM}')
# printf "The total number of masked bases are:\t"
Total=$(cat $RepMaskGff $TransPSIGff | sortBed | bedtools merge | awk -F'\t' 'BEGIN{SUM=0}{ SUM+=$3-$2 }END{print SUM}')
printf "$Organism\t$Strain\t$Repmasker\t$TPSI\t$Total\n"
done
P.cactorum 11-40 13618425 3677249 14636093
P.cactorum 12420 13952648 3830927 14889294
P.cactorum 15_13 14871198 4311252 15878513
P.cactorum 15_7 14842223 4444413 15807291
P.cactorum 17-21 14153794 3637012 15026432
P.cactorum 2003_3 14869062 4312476 15913172
P.cactorum 4032 15067487 4189364 16046834
P.cactorum 4040 14914633 4118840 15860862
P.cactorum 404 12646327 4410591 13909457
P.cactorum 415 15088727 4308075 16093167
P.cactorum 416 14782441 4199154 15719961
P.cactorum 62471 14741873 4113154 15684662
P.cactorum P295 14635224 4005338 15561536
P.cactorum P421_v2 14071987 3526248 14909132
P.cactorum PC13_15 14931997 4176759 15852338
P.cactorum R36_14 15206511 4318141 16169407
P.idaei 371 15109356 4134856 16074815
P.idaei SCRP370 15417500 4122740 16384703
P.idaei SCRP376 15516157 4111645 16516659
KAT kmer spectra analysis
for Assembly in $(ls repeat_masked/*/*/filtered_contigs_repmask/*_contigs_unmasked.fa | grep -e 'P.cactorum' -e 'P.idaei' | grep -v -e '414' -e '10300'); do
Strain=$(echo $Assembly| rev | cut -d '/' -f3 | rev | sed 's/_v2//g')
Organism=$(echo $Assembly | rev | cut -d '/' -f4 | rev)
echo "$Organism - $Strain"
IlluminaDir=$(ls -d qc_dna/paired/*/$Strain)
cat $IlluminaDir/F/*_trim.fq.gz > $IlluminaDir/F/F_trim_appended.fq.gz
cat $IlluminaDir/R/*_trim.fq.gz > $IlluminaDir/R/R_trim_appended.fq.gz
ReadsF=$(ls $IlluminaDir/F/F_trim_appended.fq.gz)
ReadsR=$(ls $IlluminaDir/R/R_trim_appended.fq.gz)
OutDir=$(dirname $Assembly)/kat
Prefix="${Strain}_repeat_masked"
ProgDir=/projects/oldhome/armita/git_repos/emr_repos/tools/seq_tools/assemblers/assembly_qc/kat
qsub $ProgDir/sub_kat.sh $Assembly $ReadsF $ReadsR $OutDir $Prefix 200
done
after KAT jobs have finished running, then remove appended trimmed reads
rm ../../../../projects/oldhome/groups/harrisonlab/project_files/alternaria/qc_dna/paired/*/*/*/F_trim_appended.fq.gz
rm ../../../../projects/oldhome/groups/harrisonlab/project_files/alternaria/qc_dna/paired/*/*/*/R_trim_appended.fq.gz
Gene prediction followed three steps: Pre-gene prediction - Quality of genome assemblies were assessed using Cegma to see how many core eukaryotic genes can be identified. Gene model training - Gene models were trained using assembled RNAseq data as part of the Braker1 pipeline Gene prediction - Gene models were used to predict genes in genomes as part of the the Braker1 pipeline. This used RNAseq data as hints for gene models.
Quality of genome assemblies was assessed by looking for the gene space in the assemblies.
Busco has replaced CEGMA and was run to check gene space in assemblies
for Assembly in $(ls repeat_masked/*/*/filtered_contigs_repmask/*_contigs_unmasked.fa | grep -e 'P.cactorum' -e 'P.idaei' | grep -v -e '414'); do
# Strain=$(echo $Assembly| rev | cut -d '/' -f3 | rev)
# Organism=$(echo $Assembly | rev | cut -d '/' -f4 | rev)
Strain=$(echo $Assembly| rev | cut -d '/' -f3 | rev)
Organism=$(echo $Assembly | rev | cut -d '/' -f4 | rev)
echo "$Organism - $Strain"
OutDir=$(dirname $Assembly)
ProgDir=/projects/oldhome/armita/git_repos/emr_repos/tools/seq_tools/assemblers/assembly_qc/quast
qsub $ProgDir/sub_quast.sh $Assembly $OutDir
ProgDir=/projects/oldhome/armita/git_repos/emr_repos/tools/gene_prediction/busco
BuscoDB="Eukaryotic"
OutDir=gene_pred/busco/$Organism/$Strain/assembly
# qsub $ProgDir/sub_busco3.sh $Assembly $BuscoDB $OutDir
BuscoDB="/projects/oldhome/groups/harrisonlab/dbBusco/alveolata_stramenophiles_ensembl"
OutDir=gene_pred/busco/$Organism/$Strain/assembly
qsub $ProgDir/sub_busco3.sh $Assembly $BuscoDB $OutDir
BuscoDB="stramenopiles"
OutDir=$(dirname $Assembly)"_stramenopiles"
OutDir=gene_pred/busco_v4/$Organism/$Strain/assembly
# qsub $ProgDir/sub_busco_v4.sh $Assembly $BuscoDB $OutDir
done
cd /projects/oldhome/groups/harrisonlab/project_files/idris
conda activate fungap
for Assembly in $(ls repeat_masked/*/*/filtered_contigs_repmask/*_contigs_unmasked.fa | grep -e 'P.cactorum' -e 'P.idaei' | grep -v -e '414'); do
Strain=$(echo $Assembly| rev | cut -d '/' -f3 | rev)
Organism=$(echo $Assembly | rev | cut -d '/' -f4 | rev)
echo "$Organism - $Strain"
OutDir=$(dirname $Assembly)
ProgDir=/projects/oldhome/armita/git_repos/emr_repos/tools/gene_prediction/busco
OutDir=gene_pred/busco/$Organism/$Strain/assembly
BuscoDB="/projects/oldhome/groups/harrisonlab/dbBusco/alveolata_stramenophiles_ensembl"
OutDir=gene_pred/busco/$Organism/$Strain/assembly
sbatch $ProgDir/slurm_busco_v3.sh $Assembly $BuscoDB $OutDir
done
for File in $(ls gene_pred/busco/*/*/assembly/run_*_contigs_unmasked/short_summary_*.txt | grep -e 'P.cactorum' -e 'P.idaei'); do
Strain=$(echo $File| rev | cut -d '/' -f4 | rev)
Organism=$(echo $File | rev | cut -d '/' -f5 | rev)
Complete=$(cat $File | grep "(C)" | cut -f2)
Fragmented=$(cat $File | grep "(F)" | cut -f2)
Missing=$(cat $File | grep "(M)" | cut -f2)
Total=$(cat $File | grep "Total" | cut -f2)
echo -e "$Organism\t$Strain\t$Complete\t$Fragmented\t$Missing\t$Total"
done
P.cactorum 10300 283 3 17 303
P.cactorum 11-40 285 3 15 303
P.cactorum 12420 285 2 16 303
P.cactorum 15_13 286 2 15 303
P.cactorum 15_7 286 2 15 303
P.cactorum 17-21 286 2 15 303
P.cactorum 2003_3 286 2 15 303
P.cactorum 4032 285 3 15 303
P.cactorum 4040 286 2 15 303
P.cactorum 404 287 3 13 303
P.cactorum 414 285 2 16 303
P.cactorum 415 285 2 16 303
P.cactorum 416 286 2 15 303
P.cactorum 62471 285 3 15 303
P.cactorum P295 285 3 15 303
P.cactorum P421 285 3 15 303
P.cactorum PC13_15 286 2 15 303
P.cactorum R36_14 285 3 15 303
P.idaei 371 285 3 15 303
P.idaei SCRP370 285 3 15 303
P.idaei SCRP376 285 3 15 303
for File in $(ls repeat_masked/*/*/filtered_contigs_repmask/report.tsv | grep -e 'P.cactorum' -e 'P.idaei'); do
Strain=$(echo $File| rev | cut -d '/' -f3 | rev)
Organism=$(echo $File | rev | cut -d '/' -f4 | rev)
Contigs=$(cat $File | grep "contigs (>= 0 bp)" | cut -f2)
Length=$(cat $File | grep "Total length (>= 0 bp)" | cut -f2)
Largest=$(cat $File | grep "Largest contig" | cut -f2)
N50=$(cat $File | grep "N50" | cut -f2)
echo -e "$Organism\t$Strain\t$Contigs\t$Length\t$Largest\t$N50"
done
P.cactorum 11-40 4571 59724358 288536 43828
P.cactorum 12420 5509 59887557 272591 37498
P.cactorum 15_13 5421 60703463 358283 41248
P.cactorum 15_7 5457 60796470 300516 43376
P.cactorum 17-21 4452 59759428 347645 49722
P.cactorum 2003_3 5467 60795145 353051 40273
P.cactorum 4032 6726 61561706 299428 38202
P.cactorum 4040 5326 60657148 238735 39295
P.cactorum 404 20136 75501778 346862 36598
P.cactorum 415 5695 60677265 313236 40742
P.cactorum 416 5485 60476717 251913 40465
P.cactorum 62471 6173 61265185 285881 36408
P.cactorum P295 5088 60707037 357485 48651
P.cactorum P421_v2 6581 59952001 239266 34103
P.cactorum PC13_15 5278 61128622 346832 39779
P.cactorum R36_14 5952 61655841 345260 36465
P.idaei 371 4720 60469986 295942 46381
P.idaei SCRP370 5356 60582035 247223 39576
P.idaei SCRP376 5313 60609233 230545 39406
#Gene prediction
Gene prediction was performed for P. cactorum genomes. Two gene prediction approaches were used:
Gene prediction using Braker1 Prediction of all putative ORFs in the genome using the ORF finder (atg.pl) approach.
Gene prediction was performed using Braker1.
First, RNAseq data was aligned to Fusarium genomes.
- qc of RNA seq data was performed as part of sequencing the 10300 genome:
Maria performed alignment of RNAseq data vs the F. vesca and those reads that did not align were used for alignment vs the P.cactorum genome.
make symbolic links to timecourse data
for File in $(ls /projects/oldhome/sobczm/popgen/rnaseq/vesca_*/*.mate1.fq.gz); do
Sample=$(echo $File | rev | cut -d '/' -f2 | rev | sed 's/vesca_//g')
echo "$Sample"
OutDir=qc_rna/paired/Transcriptome_Emily_Fenella_Pcactorum-2017-04-07_no_vesca/$Sample/F
mkdir -p "$OutDir"
cp -s $File $OutDir/.
done
for File in $(ls /projects/oldhome/sobczm/popgen/rnaseq/vesca_*/*.mate2.fq.gz); do
Sample=$(echo $File | rev | cut -d '/' -f2 | rev | sed 's/vesca_//g')
echo "$Sample"
OutDir=qc_rna/paired/Transcriptome_Emily_Fenella_Pcactorum-2017-04-07_no_vesca/$Sample/R
mkdir -p "$OutDir"
cp -s $File $OutDir/.
done
Qc of RNAseq data from mycelium and in-plant infection is detailed in commands for gene prediction of the PacBio assmbly under: assembly/pacbio_assembly.md
Performed alignment of RNAseq data vs the F. vesca genome and those reads that did not align were used for alignment vs the P.cactorum genome.
Aligning mycelium data
for Assembly in $(ls repeat_masked/*/*/filtered_contigs_repmask/*_contigs_unmasked.fa | grep -e 'P.cactorum' -e 'P.idaei' | grep -v -e '414' -e '10300'); do
Strain=$(echo $Assembly| rev | cut -d '/' -f3 | rev)
Organism=$(echo $Assembly | rev | cut -d '/' -f4 | rev)
echo "$Organism - $Strain"
for RNADir in $(ls -d ../../../../../data/scratch/armita/idris/alignment/star/fvesca/v1.1/*/* | grep 'mycelium'); do
Jobs=$(qstat | grep 'sub_sta' | grep 'qw'| wc -l)
while [ $Jobs -gt 1 ]; do
sleep 1m
printf "."
Jobs=$(qstat | grep 'sub_sta' | grep 'qw'| wc -l)
done
printf "\n"
FileF=$(ls $RNADir/*.mate1.fq.gz)
FileR=$(ls $RNADir/*.mate2.fq.gz)
echo $FileF
echo $FileR
Prefix=$(echo $RNADir | rev | cut -f1 -d '/' | rev)
Timepoint=$(echo $RNADir | rev | cut -f2 -d '/' | rev)
echo "$Timepoint"
OutDir=../../../../../data/scratch/armita/idris/alignment/star/$Organism/$Strain/$Timepoint/$Prefix
ProgDir=/projects/oldhome/armita/git_repos/emr_repos/tools/seq_tools/RNAseq
qsub $ProgDir/sub_star.sh $Assembly $FileF $FileR $OutDir
done
done
Aligning timecourse data
for Assembly in $(ls repeat_masked/*/*/filtered_contigs_repmask/*_contigs_unmasked.fa | grep -e 'P.cactorum' -e 'P.idaei' | grep -v -e '414' -e '10300'); do
Strain=$(echo $Assembly| rev | cut -d '/' -f3 | rev)
Organism=$(echo $Assembly | rev | cut -d '/' -f4 | rev)
echo "$Organism - $Strain"
for RNADir in $(ls -d qc_rna/paired/*/* | grep -e '_no_vesca'); do
FileNum=$(ls $RNADir/F/*.mate1.fq.gz | wc -l)
Jobs=$(qstat | grep 'sub_sta' | grep 'qw'| wc -l)
for num in $(seq 1 $FileNum); do
while [ $Jobs -gt 1 ]; do
sleep 1m
printf "."
Jobs=$(qstat | grep 'sub_sta' | grep 'qw'| wc -l)
done
printf "\n"
FileF=$(ls $RNADir/F/*.mate1.fq.gz | head -n $num | tail -n1)
FileR=$(ls $RNADir/R/*.mate2.fq.gz | head -n $num | tail -n1)
echo $FileF
echo $FileR
Prefix=$(echo $RNADir | rev | cut -f1 -d '/' | rev)
Jobs=$(qstat | grep 'sub_sta' | grep 'qw'| wc -l)
Timepoint=$(echo $RNADir | rev | cut -f1 -d '/' | rev)
echo "$Timepoint"
OutDir=../../../../../data/scratch/armita/idris/alignment/star/$Organism/$Strain/$Timepoint/$Prefix
ProgDir=/projects/oldhome/armita/git_repos/emr_repos/tools/seq_tools/RNAseq
qsub $ProgDir/sub_star.sh $Assembly $FileF $FileR $OutDir
done
done
done
Alignments were concatenated prior to gene prediction
qlogin -pe smp 8
cd /projects/oldhome/groups/harrisonlab/project_files/idris
for StrainDir in $(ls -d qc_dna/paired/P.*/* | grep -v -w -e '411' -e '10300_old' | grep -e 'cactorum' -e 'idaei' | grep -e '11-40' -e '17-21' -e 'P421'); do
echo $StrainDir
BamFiles=$(ls $StrainDir/*/*/star_aligmentAligned.sortedByCoord.out.bam | grep -v -e 'PRO1467_S1_' -e 'PRO1467_S2_' -e 'PRO1467_S3_' -e 'PRO1467_S10_' -e 'PRO1467_S11_' -e 'PRO1467_S12_' | tr -d '\n' | sed 's/.bam/.bam /g')
OutDir=$StrainDir/concatenated
mkdir -p $OutDir
samtools merge -@ 8 -f $OutDir/concatenated.bam $BamFiles
done
logout
Before braker predictiction was performed, I double checked that I had the genemark key in my user area and copied it over from the genemark install directory:
ls ~/.gm_key
cp /projects/oldhome/armita/prog/genemark/gm_key_64 ~/.gm_key
for Assembly in $(ls repeat_masked/*/*/filtered_contigs_repmask/*_contigs_softmasked_repeatmasker_TPSI_appended.fa | grep -e 'P.cactorum' -e 'P.idaei' | grep -v -e '414' -e '10300'); do
Strain=$(echo $Assembly| rev | cut -d '/' -f3 | rev)
Organism=$(echo $Assembly | rev | cut -d '/' -f4 | rev)
echo "$Organism - $Strain"
mkdir -p alignment/$Organism/$Strain/concatenated
OutDir=gene_pred/braker/$Organism/"$Strain"_braker
AcceptedHits=$(ls ../../../../../data/scratch/armita/idris/alignment/$Organism/$Strain/concatenated/concatenated.bam)
GeneModelName="$Organism"_"$Strain"_braker
rm -r /projects/oldhome/armita/prog/augustus-3.1/config/species/"$Organism"_"$Strain"_braker
ProgDir=/projects/oldhome/armita/git_repos/emr_repos/tools/gene_prediction/braker1
qsub $ProgDir/sub_braker.sh $Assembly $OutDir $AcceptedHits $GeneModelName
done
Additional genes were added to Braker gene predictions, using CodingQuary in pathogen mode to predict additional regions.
Firstly, aligned RNAseq data was assembled into transcripts using Cufflinks.
Note - cufflinks doesn't always predict direction of a transcript and therefore features can not be restricted by strand when they are intersected.
for Assembly in $(ls repeat_masked/*/*/filtered_contigs_repmask/*_contigs_softmasked_repeatmasker_TPSI_appended.fa | grep -e 'P.cactorum' -e 'P.idaei' | grep -v -e '414' -e '10300'); do
Strain=$(echo $Assembly| rev | cut -d '/' -f3 | rev)
Organism=$(echo $Assembly | rev | cut -d '/' -f4 | rev)
Jobs=$(qstat | grep 'sub_cuff' | grep 'qw'| wc -l)
while [ $Jobs -gt 1 ]; do
sleep 1m
printf "."
Jobs=$(qstat | grep 'sub_cuff' | grep 'qw'| wc -l)
done
printf "\n"
echo "$Organism - $Strain"
OutDir=gene_pred/cufflinks/$Organism/$Strain/concatenated
mkdir -p $OutDir
AcceptedHits=$(ls ../../../../../data/scratch/armita/idris/alignment/star/$Organism/$Strain/concatenated/concatenated.bam)
ProgDir=/projects/oldhome/armita/git_repos/emr_repos/tools/seq_tools/RNAseq
qsub $ProgDir/sub_cufflinks.sh $AcceptedHits $OutDir
done
Secondly, genes were predicted using CodingQuary:
for Assembly in $(ls repeat_masked/*/*/filtered_contigs_repmask/*_contigs_softmasked_repeatmasker_TPSI_appended.fa | grep -e 'P.cactorum' -e 'P.idaei' | grep -v -e '414' -e '10300' | grep 'idaei'); do
Strain=$(echo $Assembly| rev | cut -d '/' -f3 | rev)
Organism=$(echo $Assembly | rev | cut -d '/' -f4 | rev)
echo "$Organism - $Strain"
Jobs=$(qstat | grep 'sub_Cod' | grep 'qw'| wc -l)
while [ $Jobs -gt 1 ]; do
sleep 1m
printf "."
Jobs=$(qstat | grep 'sub_Cod' | grep 'qw'| wc -l)
done
printf "\n"
OutDir=gene_pred/codingquary/$Organism/$Strain
CufflinksGTF=$(ls gene_pred/cufflinks/$Organism/$Strain/concatenated/transcripts.gtf)
ProgDir=/projects/oldhome/armita/git_repos/emr_repos/tools/gene_prediction/codingquary
qsub $ProgDir/sub_CodingQuary.sh $Assembly $CufflinksGTF $OutDir
done
Then, additional transcripts were added to Braker gene models, when CodingQuary genes were predicted in regions of the genome, not containing Braker gene models:
for BrakerGff in $(ls gene_pred/braker/P.*/*_braker/*/augustus.gff3 | grep -e 'P.cactorum' -e 'P.idaei' | grep -v -e '414' -e '10300'); do
Strain=$(echo $BrakerGff| rev | cut -d '/' -f3 | rev | sed 's/_braker_new//g' | sed 's/_braker_pacbio//g' | sed 's/_braker//g')
Organism=$(echo $BrakerGff | rev | cut -d '/' -f4 | rev)
echo "$Organism - $Strain"
Assembly=$(ls repeat_masked/$Organism/$Strain/filtered_contigs_repmask/"$Strain"_contigs_softmasked_repeatmasker_TPSI_appended.fa)
CodingQuaryGff=gene_pred/codingquary/$Organism/$Strain/out/PredictedPass.gff3
PGNGff=gene_pred/codingquary/$Organism/$Strain/out/PGN_predictedPass.gff3
AddDir=gene_pred/codingquary/$Organism/$Strain/additional
FinalDir=gene_pred/codingquary/$Organism/$Strain/final
AddGenesList=$AddDir/additional_genes.txt
AddGenesGff=$AddDir/additional_genes.gff
FinalGff=$AddDir/combined_genes.gff
mkdir -p $AddDir
mkdir -p $FinalDir
bedtools intersect -v -a $CodingQuaryGff -b $BrakerGff | grep 'gene'| cut -f2 -d'=' | cut -f1 -d';' > $AddGenesList
bedtools intersect -v -a $PGNGff -b $BrakerGff | grep 'gene'| cut -f2 -d'=' | cut -f1 -d';' >> $AddGenesList
ProgDir=/projects/oldhome/armita/git_repos/emr_repos/tools/seq_tools/feature_annotation
$ProgDir/gene_list_to_gff.pl $AddGenesList $CodingQuaryGff CodingQuarry_v2.0 ID CodingQuary > $AddGenesGff
$ProgDir/gene_list_to_gff.pl $AddGenesList $PGNGff PGNCodingQuarry_v2.0 ID CodingQuary >> $AddGenesGff
ProgDir=/projects/oldhome/armita/git_repos/emr_repos/tools/gene_prediction/codingquary
# -
# This section is edited
$ProgDir/add_CodingQuary_features.pl $AddGenesGff $Assembly > $AddDir/add_genes_CodingQuary_unspliced.gff3
$ProgDir/correct_CodingQuary_splicing.py --inp_gff $AddDir/add_genes_CodingQuary_unspliced.gff3 > $FinalDir/final_genes_CodingQuary.gff3
# -
$ProgDir/gff2fasta.pl $Assembly $FinalDir/final_genes_CodingQuary.gff3 $FinalDir/final_genes_CodingQuary
cp $BrakerGff $FinalDir/final_genes_Braker.gff3
$ProgDir/gff2fasta.pl $Assembly $FinalDir/final_genes_Braker.gff3 $FinalDir/final_genes_Braker
cat $FinalDir/final_genes_Braker.pep.fasta $FinalDir/final_genes_CodingQuary.pep.fasta | sed -r 's/\*/X/g' > $FinalDir/final_genes_combined.pep.fasta
cat $FinalDir/final_genes_Braker.cdna.fasta $FinalDir/final_genes_CodingQuary.cdna.fasta > $FinalDir/final_genes_combined.cdna.fasta
cat $FinalDir/final_genes_Braker.gene.fasta $FinalDir/final_genes_CodingQuary.gene.fasta > $FinalDir/final_genes_combined.gene.fasta
cat $FinalDir/final_genes_Braker.upstream3000.fasta $FinalDir/final_genes_CodingQuary.upstream3000.fasta > $FinalDir/final_genes_combined.upstream3000.fasta
GffBraker=$FinalDir/final_genes_CodingQuary.gff3
GffQuary=$FinalDir/final_genes_Braker.gff3
GffAppended=$FinalDir/final_genes_appended.gff3
cat $GffBraker $GffQuary > $GffAppended
done
The final number of genes per isolate was observed using:
for DirPath in $(ls -d gene_pred/codingquary/P.*/*/final | grep -e 'P.cactorum' -e 'P.idaei' | grep -v -e '414' -e '10300'); do
Strain=$(echo $DirPath| rev | cut -d '/' -f2 | rev)
Organism=$(echo $DirPath | rev | cut -d '/' -f3 | rev)
echo "$Organism - $Strain"
cat $DirPath/final_genes_Braker.pep.fasta | grep '>' | wc -l;
cat $DirPath/final_genes_CodingQuary.pep.fasta | grep '>' | wc -l;
cat $DirPath/final_genes_combined.pep.fasta | grep '>' | wc -l;
echo "";
done
The number of genes predicted by Braker, supplimented by CodingQuary and in the final combined dataset was shown:
P.cactorum - 11-40
18162
10665
28827
P.cactorum - 12420
18079
8152
26231
P.cactorum - 15_13
18008
8736
26744
P.cactorum - 15_7
17719
8885
26604
P.cactorum - 17-21
17859
10718
28577
P.cactorum - 2003_3
18241
11545
29786
P.cactorum - 4032
17790
9029
26819
P.cactorum - 4040
18098
11432
29530
P.cactorum - 404
23148
12618
35766
P.cactorum - 415
17932
8864
26796
P.cactorum - 416
17883
11508
29391
P.cactorum - 62471
18214
11382
29596
P.cactorum - P295
17776
8551
26327
P.cactorum - P421_v2
17819
8305
26124
P.cactorum - PC13_15
18147
8460
26607
P.cactorum - R36_14
18432
11497
29929
P.idaei - 371
17190
8197
25387
P.idaei - SCRP370
17291
10591
27882
P.idaei - SCRP376
17429
8061
25490
Open reading frame predictions were made using the atg.pl script as part of the path_pipe.sh pipeline. This pipeline also identifies open reading frames containing Signal peptide sequences and RxLRs. This pipeline was run with the following commands:
ProgDir=/projects/oldhome/armita/git_repos/emr_repos/tools/gene_prediction/ORF_finder
for Genome in $(ls repeat_masked/P.*/*/filtered_contigs_repmask/*_contigs_unmasked.fa | grep -e 'P.cactorum' -e 'P.idaei' | grep -v -e '414' -e '10300'); do
echo "$Genome"
qsub $ProgDir/run_ORF_finder.sh $Genome
done
The Gff files from the the ORF finder are not in true Gff3 format. These were corrected using the following commands:
ProgDir=~/git_repos/emr_repos/tools/seq_tools/feature_annotation
for OrfGff in $(ls gene_pred/ORF_finder/P.*/*/*_ORF.gff | grep -e 'P.cactorum' -e 'P.idaei' | grep -v -e '414' -e '10300'); do
echo "$OrfGff"
OrfGffMod=$(echo $OrfGff | sed 's/.gff/.gff3/g')
$ProgDir/gff_corrector.pl $OrfGff > $OrfGffMod
done
#Genomic analysis
Putative RxLR genes were identified within Augustus gene models using a number of approaches:
- A) From Augustus gene models - Signal peptide & RxLR motif
- B) From Augustus gene models - Hmm evidence of WY domains
- C) From Augustus gene models - Hmm evidence of RxLR effectors
- D) From Augustus gene models - Hmm evidence of CRN effectors
Required programs:
- SigP
- biopython
Proteins that were predicted to contain signal peptides were identified using the following commands:
for Proteome in $(ls gene_pred/codingquary/*/*/*/final_genes_combined.pep.fasta | grep -e 'P.cactorum' -e 'P.idaei' | grep -v -e '414' -e '10300' | grep -v 'PC13_15'); do
SplitfileDir=/projects/oldhome/armita/git_repos/emr_repos/tools/seq_tools/feature_annotation/signal_peptides
ProgDir=/projects/oldhome/armita/git_repos/emr_repos/tools/seq_tools/feature_annotation/signal_peptides
Strain=$(echo $Proteome | rev | cut -f3 -d '/' | rev)
Organism=$(echo $Proteome | rev | cut -f4 -d '/' | rev)
SplitDir=gene_pred/final_split/$Organism/$Strain
mkdir -p $SplitDir
BaseName="$Organism""_$Strain"_final
$SplitfileDir/splitfile_500.py --inp_fasta $Proteome --out_dir $SplitDir --out_base $BaseName
for File in $(ls $SplitDir/*_final_*); do
Jobs=$(qstat | grep 'pred_sigP' | grep -w 'qw'| wc -l)
while [ $Jobs -gt 4 ]; do
sleep 1
printf "."
Jobs=$(qstat | grep 'pred_sigP' | grep -w 'qw'| wc -l)
done
printf "\n"
echo $File
# qsub $ProgDir/pred_sigP.sh $File
qsub $ProgDir/pred_sigP.sh $File signalp-3.0
# qsub $ProgDir/pred_sigP.sh $File signalp-4.1
done
done
Problems with SignalP3 prediction of isolate PC13_15 were found to be associate with the length of the isolate name. The name was shortened for prediction and will be renamed again when results are concatenated.
for Proteome in $(ls gene_pred/codingquary/*/*/*/final_genes_combined.pep.fasta | grep -e 'P.cactorum' -e 'P.idaei' | grep -v -e '414' -e '10300' | grep 'PC13'); do
SplitfileDir=/projects/oldhome/armita/git_repos/emr_repos/tools/seq_tools/feature_annotation/signal_peptides
ProgDir=/projects/oldhome/armita/git_repos/emr_repos/tools/seq_tools/feature_annotation/signal_peptides
Strain=$(echo $Proteome | rev | cut -f3 -d '/' | rev | sed 's/_15//g')
Organism=$(echo $Proteome | rev | cut -f4 -d '/' | rev)
SplitDir=gene_pred/final_split/$Organism/$Strain
mkdir -p $SplitDir
BaseName="$Organism""_$Strain"_final
$SplitfileDir/splitfile_500.py --inp_fasta $Proteome --out_dir $SplitDir --out_base $BaseName
for File in $(ls $SplitDir/*_final_*); do
Jobs=$(qstat | grep 'pred_sigP' | grep -w 'qw'| wc -l)
while [ $Jobs -gt 4 ]; do
sleep 1
printf "."
Jobs=$(qstat | grep 'pred_sigP' | grep -w 'qw'| wc -l)
done
printf "\n"
echo $File
qsub $ProgDir/pred_sigP.sh $File
qsub $ProgDir/pred_sigP.sh $File signalp-3.0
qsub $ProgDir/pred_sigP.sh $File signalp-4.1
done
done
The batch files of predicted secreted proteins needed to be combined into a single file for each strain. This was done with the following commands:
for SplitDir in $(ls -d gene_pred/final_split/P.*/* | grep -e 'P.cactorum' -e 'P.idaei' | grep -v -e '414' -e '10300' | grep -v PC13); do
Strain=$(echo $SplitDir | cut -d '/' -f4)
Organism=$(echo $SplitDir | cut -d '/' -f3)
echo "$Organism - $Strain"
InStringAA=''
InStringNeg=''
InStringTab=''
InStringTxt=''
for SigpDir in $(ls -d gene_pred/final_sig* | cut -f2 -d'/'); do
for GRP in $(ls -l $SplitDir/*_final_*.fa | rev | cut -d '_' -f1 | rev | sort -n); do
InStringAA="$InStringAA gene_pred/$SigpDir/$Organism/$Strain/split/"$Organism"_"$Strain"_final_$GRP""_sp.aa";
InStringNeg="$InStringNeg gene_pred/$SigpDir/$Organism/$Strain/split/"$Organism"_"$Strain"_final_$GRP""_sp_neg.aa";
InStringTab="$InStringTab gene_pred/$SigpDir/$Organism/$Strain/split/"$Organism"_"$Strain"_final_$GRP""_sp.tab";
InStringTxt="$InStringTxt gene_pred/$SigpDir/$Organism/$Strain/split/"$Organism"_"$Strain"_final_$GRP""_sp.txt";
done
cat $InStringAA > gene_pred/$SigpDir/$Organism/$Strain/"$Strain"_aug_sp.aa
cat $InStringNeg > gene_pred/$SigpDir/$Organism/$Strain/"$Strain"_aug_neg_sp.aa
tail -n +2 -q $InStringTab > gene_pred/$SigpDir/$Organism/$Strain/"$Strain"_aug_sp.tab
cat $InStringTxt > gene_pred/$SigpDir/$Organism/$Strain/"$Strain"_aug_sp.txt
done
done
for SplitDir in $(ls -d gene_pred/final_split/P.*/* | grep -e 'P.cactorum' -e 'P.idaei' | grep -v -e '414' -e '10300' | grep -w 'PC13'); do
Strain=$(echo $SplitDir | cut -d '/' -f4)
Organism=$(echo $SplitDir | cut -d '/' -f3)
echo "$Organism - $Strain"
InStringAA=''
InStringNeg=''
InStringTab=''
InStringTxt=''
for SigpDir in $(ls -d gene_pred/final_sig* | cut -f2 -d'/'); do
for GRP in $(ls -l $SplitDir/*_final_*.fa | rev | cut -d '_' -f1 | rev | sort -n); do
InStringAA="$InStringAA gene_pred/$SigpDir/$Organism/$Strain/split/"$Organism"_"$Strain"_final_$GRP""_sp.aa";
InStringNeg="$InStringNeg gene_pred/$SigpDir/$Organism/$Strain/split/"$Organism"_"$Strain"_final_$GRP""_sp_neg.aa";
InStringTab="$InStringTab gene_pred/$SigpDir/$Organism/$Strain/split/"$Organism"_"$Strain"_final_$GRP""_sp.tab";
InStringTxt="$InStringTxt gene_pred/$SigpDir/$Organism/$Strain/split/"$Organism"_"$Strain"_final_$GRP""_sp.txt";
done
cat $InStringAA > gene_pred/$SigpDir/$Organism/${Strain}_15/${Strain}_15_aug_sp.aa
cat $InStringNeg > gene_pred/$SigpDir/$Organism/${Strain}_15/${Strain}_15_aug_neg_sp.aa
tail -n +2 -q $InStringTab > gene_pred/$SigpDir/$Organism/${Strain}_15/${Strain}_15_aug_sp.tab
cat $InStringTxt > gene_pred/$SigpDir/$Organism/${Strain}_15/${Strain}_15aug_sp.txt
done
done
Secreted proteins were also predicted using Phobius
for Proteome in $(ls gene_pred/codingquary/*/*/*/final_genes_combined.pep.fasta | grep -e 'P.cactorum' -e 'P.idaei' | grep -v -e '414' -e '10300'); do
Strain=$(echo $Proteome | rev | cut -f3 -d '/' | rev)
Organism=$(echo $Proteome | rev | cut -f4 -d '/' | rev)
echo "$Organism - $Strain"
OutDir=analysis/phobius/$Organism/$Strain
mkdir -p $OutDir
phobius.pl $Proteome > $OutDir/"$Strain"_phobius.txt
ProgDir=/projects/oldhome/armita/git_repos/emr_repos/tools/seq_tools/feature_annotation/signal_peptides
$ProgDir/phobius_parser.py --inp_fasta $Proteome --phobius_txt $OutDir/"$Strain"_phobius.txt --out_fasta $OutDir/"$Strain"_phobius.fa
cat $OutDir/"$Strain"_phobius.fa | grep '>' | cut -f1 | tr -d '>' > $OutDir/"$Strain"_phobius_headers.txt
done
Secreted proteins from different sources were combined into a single file:
for Proteome in $(ls gene_pred/codingquary/*/*/*/final_genes_combined.pep.fasta | grep -e 'P.cactorum' -e 'P.idaei' | grep -v -e '414' -e '10300'); do
Strain=$(echo $Proteome | rev | cut -f3 -d '/' | rev)
Organism=$(echo $Proteome | rev | cut -f4 -d '/' | rev)
echo "$Organism - $Strain"
OutDir=gene_pred/combined_sigP/$Organism/$Strain
mkdir -p $OutDir
echo "The following number of sequences were predicted as secreted:"
cat gene_pred/final_sig*/$Organism/$Strain/*_aug_sp.aa analysis/phobius/$Organism/$Strain/"$Strain"_phobius.fa > $OutDir/"$Strain"_all_secreted.fa
cat $OutDir/"$Strain"_all_secreted.fa | grep '>' | wc -l
echo "This represented the following number of unique genes:"
cat gene_pred/final_sig*/$Organism/$Strain/*_aug_sp.aa analysis/phobius/$Organism/$Strain/"$Strain"_phobius.fa | grep '>' | cut -f1 | tr -d ' >' | sort -g | uniq > $OutDir/"$Strain"_secreted.txt
ProgDir=/projects/oldhome/armita/git_repos/emr_repos/tools/gene_prediction/ORF_finder
$ProgDir/extract_from_fasta.py --fasta $Proteome --headers $OutDir/"$Strain"_secreted.txt > $OutDir/"$Strain"_secreted.fa
cat $OutDir/"$Strain"_secreted.fa | grep '>' | wc -l
done
P.cactorum - 11-40
The following number of sequences were predicted as secreted:
16631
This represented the following number of unique genes:
3273
P.cactorum - 12420
The following number of sequences were predicted as secreted:
16145
This represented the following number of unique genes:
3139
P.cactorum - 15_13
The following number of sequences were predicted as secreted:
16187
This represented the following number of unique genes:
3142
P.cactorum - 15_7
The following number of sequences were predicted as secreted:
16028
This represented the following number of unique genes:
3110
P.cactorum - 17-21
The following number of sequences were predicted as secreted:
17022
This represented the following number of unique genes:
3320
P.cactorum - 2003_3
The following number of sequences were predicted as secreted:
16784
This represented the following number of unique genes:
3289
P.cactorum - 4032
The following number of sequences were predicted as secreted:
16284
This represented the following number of unique genes:
3172
P.cactorum - 4040
The following number of sequences were predicted as secreted:
16816
This represented the following number of unique genes:
3310
P.cactorum - 404
The following number of sequences were predicted as secreted:
19599
This represented the following number of unique genes:
3924
P.cactorum - 415
The following number of sequences were predicted as secreted:
17325
This represented the following number of unique genes:
3166
P.cactorum - 416
The following number of sequences were predicted as secreted:
17468
This represented the following number of unique genes:
3283
P.cactorum - 62471
The following number of sequences were predicted as secreted:
17048
This represented the following number of unique genes:
3354
P.cactorum - P295
The following number of sequences were predicted as secreted:
16146
This represented the following number of unique genes:
3117
P.cactorum - P421_v2
The following number of sequences were predicted as secreted:
7065
This represented the following number of unique genes:
2972
P.cactorum - PC13_15
The following number of sequences were predicted as secreted:
16328
This represented the following number of unique genes:
3168
P.cactorum - R36_14
The following number of sequences were predicted as secreted:
17028
This represented the following number of unique genes:
3379
P.idaei - 371
The following number of sequences were predicted as secreted:
15536
This represented the following number of unique genes:
2918
P.idaei - SCRP370
The following number of sequences were predicted as secreted:
15635
This represented the following number of unique genes:
3104
P.idaei - SCRP376
The following number of sequences were predicted as secreted:
14942
This represented the following number of unique genes:
2936
Some proteins that are incorporated into the cell membrane require secretion. Therefore proteins with a transmembrane domain are not likely to represent cytoplasmic or apoplastic effectors.
Proteins containing a transmembrane domain were identified:
for Proteome in $(ls gene_pred/codingquary/*/*/*/final_genes_combined.pep.fasta | grep -e 'P.cactorum' -e 'P.idaei' | grep -v -e '414' -e '10300'); do
Strain=$(echo $Proteome | rev | cut -f3 -d '/' | rev)
Organism=$(echo $Proteome | rev | cut -f4 -d '/' | rev)
ProgDir=/projects/oldhome/armita/git_repos/emr_repos/tools/seq_tools/feature_annotation/transmembrane_helices
qsub $ProgDir/submit_TMHMM.sh $Proteome
done
Those proteins with transmembrane domains were removed from lists of Signal peptide containing proteins
for File in $(ls gene_pred/trans_mem/*/*/*_TM_genes_neg.txt | grep -v -e '414' -e '10300'); do
Strain=$(echo $File | rev | cut -f2 -d '/' | rev)
Organism=$(echo $File | rev | cut -f3 -d '/' | rev)
echo "$Organism - $Strain"
NonTmHeaders=$(echo "$File" | sed 's/neg.txt/neg_headers.txt/g')
cat $File | cut -f1 > $NonTmHeaders
SigP=$(ls gene_pred/combined_sigP/$Organism/$Strain/"$Strain"_secreted.fa)
OutDir=$(dirname $SigP)
ProgDir=/projects/oldhome/armita/git_repos/emr_repos/tools/gene_prediction/ORF_finder
$ProgDir/extract_from_fasta.py --fasta $SigP --headers $NonTmHeaders > $OutDir/"$Strain"_final_sp_no_trans_mem.aa
echo "Number of SigP proteins:"
cat $SigP | grep '>' | wc -l
echo "Number without transmembrane domains:"
cat $OutDir/"$Strain"_final_sp_no_trans_mem.aa | grep '>' | wc -l
echo "Number of gene models:"
cat $OutDir/"$Strain"_final_sp_no_trans_mem.aa | grep '>' | cut -f1 -d't' | sort | uniq |wc -l
# A text file was also made containing headers of proteins testing +ve
PosFile=$(ls gene_pred/trans_mem/$Organism/$Strain/"$Strain"_TM_genes_pos.txt)
TmHeaders=$(echo $PosFile | sed 's/.txt/_headers.txt/g')
cat $PosFile | cut -f1 > $TmHeaders
done
The regular expression R.LR.{,40}[ED][ED][KR] has previously been used to identify RxLR effectors. The addition of an EER motif is significant as it has been shown as required for host uptake of the protein.
The RxLR_EER_regex_finder.py script was used to search for this regular expression and annotate the EER domain where present.
for Secretome in $(ls gene_pred/combined_sigP/*/*/*_all_secreted.fa | grep -e 'P.cactorum' -e 'P.idaei' | grep -v -e '414' -e '10300'); do
Strain=$(echo $Secretome | rev | cut -d '/' -f2 | rev);
Organism=$(echo $Secretome | rev | cut -d '/' -f3 | rev) ;
Proteome=$(ls gene_pred/codingquary/$Organism/$Strain/*/final_genes_combined.pep.fasta)
Gff=$(ls gene_pred/*/$Organism/$Strain/final/final_genes_appended.gff3)
OutDir=analysis/RxLR_effectors/RxLR_EER_regex_finder/"$Organism"/"$Strain";
mkdir -p $OutDir;
printf "\nstrain: $Strain\tspecies: $Organism\n";
printf "the total number of SigP gene is:\t";
cat $Secretome | grep '>' | wc -l;
printf "the number of unique SigP gene is:\t";
cat $Secretome | grep '>' | cut -f1 | tr -d ' '| sort | uniq | wc -l;
printf "the number of SigP-RxLR genes are:\t";
ProgDir=~/git_repos/emr_repos/tools/pathogen/RxLR_effectors
$ProgDir/RxLR_EER_regex_finder.py $Secretome > $OutDir/"$Strain"_all_secreted_RxLR_regex.fa;
cat $OutDir/"$Strain"_all_secreted_RxLR_regex.fa | grep '>' | cut -f1 | tr -d '>' | tr -d ' ' | sort -g | uniq > $OutDir/"$Strain"_RxLR_regex.txt
cat $OutDir/"$Strain"_RxLR_regex.txt | wc -l
ProgDir=/projects/oldhome/armita/git_repos/emr_repos/tools/gene_prediction/ORF_finder
$ProgDir/extract_from_fasta.py --fasta $Proteome --headers $OutDir/"$Strain"_RxLR_regex.txt > $OutDir/"$Strain"_RxLR_EER_regex.fa
printf "the number of SigP-RxLR-EER genes are:\t";
cat $OutDir/"$Strain"_all_secreted_RxLR_regex.fa | grep '>' | grep 'EER_motif_start' | cut -f1 | tr -d '>' | tr -d ' ' | sort -g | uniq > $OutDir/"$Strain"_RxLR_EER_regex.txt
cat $OutDir/"$Strain"_RxLR_EER_regex.txt | wc -l
$ProgDir/extract_from_fasta.py --fasta $Proteome --headers $OutDir/"$Strain"_RxLR_EER_regex.txt > $OutDir/"$Strain"_RxLR_EER_regex.fa
printf "\n"
ProgDir=/projects/oldhome/armita/git_repos/emr_repos/tools/seq_tools/feature_annotation
sed -i -r 's/\.t.*//' $OutDir/"$Strain"_RxLR_regex.txt
sed -i -r 's/\.t.*//' $OutDir/"$Strain"_RxLR_EER_regex.txt
cat $Gff | grep -w -f $OutDir/"$Strain"_RxLR_regex.txt> $OutDir/"$Strain"_RxLR_regex.gff3
cat $Gff | grep -w -f $OutDir/"$Strain"_RxLR_EER_regex.txt > $OutDir/"$Strain"_RxLR_EER_regex.gff3
done
strain: 11-40 species: P.cactorum
the total number of SigP gene is: 16631
the number of unique SigP gene is: 3273
the number of SigP-RxLR genes are: 279
the number of SigP-RxLR-EER genes are: 114
strain: 12420 species: P.cactorum
the total number of SigP gene is: 16145
the number of unique SigP gene is: 3139
the number of SigP-RxLR genes are: 273
the number of SigP-RxLR-EER genes are: 114
strain: 15_13 species: P.cactorum
the total number of SigP gene is: 16187
the number of unique SigP gene is: 3142
the number of SigP-RxLR genes are: 269
the number of SigP-RxLR-EER genes are: 112
strain: 15_7 species: P.cactorum
the total number of SigP gene is: 16028
the number of unique SigP gene is: 3110
the number of SigP-RxLR genes are: 266
the number of SigP-RxLR-EER genes are: 105
strain: 17-21 species: P.cactorum
the total number of SigP gene is: 17022
the number of unique SigP gene is: 3320
the number of SigP-RxLR genes are: 295
the number of SigP-RxLR-EER genes are: 131
strain: 2003_3 species: P.cactorum
the total number of SigP gene is: 16784
the number of unique SigP gene is: 3289
the number of SigP-RxLR genes are: 289
the number of SigP-RxLR-EER genes are: 119
strain: 4032 species: P.cactorum
the total number of SigP gene is: 16284
the number of unique SigP gene is: 3172
the number of SigP-RxLR genes are: 276
the number of SigP-RxLR-EER genes are: 112
strain: 4040 species: P.cactorum
the total number of SigP gene is: 16816
the number of unique SigP gene is: 3310
the number of SigP-RxLR genes are: 284
the number of SigP-RxLR-EER genes are: 119
strain: 404 species: P.cactorum
the total number of SigP gene is: 19599
the number of unique SigP gene is: 3924
the number of SigP-RxLR genes are: 296
the number of SigP-RxLR-EER genes are: 118
strain: 415 species: P.cactorum
the total number of SigP gene is: 17325
the number of unique SigP gene is: 3397
the number of SigP-RxLR genes are: 299
the number of SigP-RxLR-EER genes are: 120
strain: 416 species: P.cactorum
the total number of SigP gene is: 17468
the number of unique SigP gene is: 3469
the number of SigP-RxLR genes are: 306
the number of SigP-RxLR-EER genes are: 122
strain: 62471 species: P.cactorum
the total number of SigP gene is: 17048
the number of unique SigP gene is: 3354
the number of SigP-RxLR genes are: 292
the number of SigP-RxLR-EER genes are: 133
strain: P295 species: P.cactorum
the total number of SigP gene is: 16146
the number of unique SigP gene is: 3117
the number of SigP-RxLR genes are: 270
the number of SigP-RxLR-EER genes are: 122
strain: P421_v2 species: P.cactorum
the total number of SigP gene is: 7065
the number of unique SigP gene is: 2972
the number of SigP-RxLR genes are: 267
the number of SigP-RxLR-EER genes are: 115
strain: PC13_15 species: P.cactorum
the total number of SigP gene is: 16328
the number of unique SigP gene is: 3168
the number of SigP-RxLR genes are: 281
the number of SigP-RxLR-EER genes are: 121
strain: R36_14 species: P.cactorum
the total number of SigP gene is: 17028
the number of unique SigP gene is: 3379
the number of SigP-RxLR genes are: 288
the number of SigP-RxLR-EER genes are: 130
strain: 371 species: P.idaei
the total number of SigP gene is: 15536
the number of unique SigP gene is: 3049
the number of SigP-RxLR genes are: 255
the number of SigP-RxLR-EER genes are: 121
strain: SCRP370 species: P.idaei
the total number of SigP gene is: 15635
the number of unique SigP gene is: 3104
the number of SigP-RxLR genes are: 258
the number of SigP-RxLR-EER genes are: 116
strain: SCRP376 species: P.idaei
the total number of SigP gene is: 14942
the number of unique SigP gene is: 2936
the number of SigP-RxLR genes are: 248
the number of SigP-RxLR-EER genes are: 112
for Proteome in $(ls gene_pred/codingquary/*/*/*/final_genes_combined.pep.fasta | grep -e 'P.cactorum' -e 'P.idaei' | grep -v -e '414' -e '10300'); do
ProgDir=/projects/oldhome/armita/git_repos/emr_repos/scripts/phytophthora/pathogen/hmmer
HmmModel=/projects/oldhome/armita/git_repos/emr_repos/SI_Whisson_et_al_2007/cropped.hmm
Strain=$(echo $Proteome | rev | cut -f3 -d '/' | rev)
Organism=$(echo $Proteome | rev | cut -f4 -d '/' | rev)
OutDir=analysis/RxLR_effectors/hmmer_RxLR/$Organism/$Strain
mkdir -p $OutDir
HmmResults="$Strain"_RxLR_hmmer.txt
hmmsearch -T 0 $HmmModel $Proteome > $OutDir/$HmmResults
echo "$Organism $Strain"
cat $OutDir/$HmmResults | grep 'Initial search space'
cat $OutDir/$HmmResults | grep 'number of targets reported over threshold'
HmmFasta="$Strain"_RxLR_hmmer.fa
$ProgDir/hmmer2fasta.pl $OutDir/$HmmResults $Proteome > $OutDir/$HmmFasta
Headers="$Strain"_RxLR_hmmer_headers.txt
cat $OutDir/$HmmFasta | grep '>' | cut -f1 | tr -d '>' | sed -r 's/\.t.*//' | tr -d ' ' | sort | uniq > $OutDir/$Headers
Gff=$(ls gene_pred/*/$Organism/$Strain/final/final_genes_appended.gff3)
cat $Gff | grep -w -f $OutDir/$Headers > $OutDir/"$Strain"_Aug_RxLR_regex.gff3
done
P.cactorum 404
Initial search space (Z): 27775 [actual number of targets]
Domain search space (domZ): 127 [number of targets reported over threshold]
P.cactorum 414
Initial search space (Z): 32832 [actual number of targets]
Domain search space (domZ): 146 [number of targets reported over threshold]
P.cactorum 415
Initial search space (Z): 31944 [actual number of targets]
Domain search space (domZ): 132 [number of targets reported over threshold]
P.cactorum 416
Initial search space (Z): 32780 [actual number of targets]
Domain search space (domZ): 129 [number of targets reported over threshold]
P.cactorum 62471
Initial search space (Z): 27191 [actual number of targets]
Domain search space (domZ): 142 [number of targets reported over threshold]
P.idaei 371
Initial search space (Z): 27253 [actual number of targets]
Domain search space (domZ): 107 [number of targets reported over threshold]
P.idaei SCRP370
Initial search space (Z): 26983 [actual number of targets]
Domain search space (domZ): 105 [number of targets reported over threshold]
The total RxLRs are
for RegexRxLR in $(ls analysis/RxLR_effectors/RxLR_EER_regex_finder/*/*/*_RxLR_EER_regex.txt | grep -e 'P.cactorum' -e 'P.idaei' | grep -v -e '414' -e '10300'); do
Organism=$(echo $RegexRxLR | rev | cut -d '/' -f3 | rev)
Strain=$(echo $RegexRxLR | rev | cut -d '/' -f2 | rev)
Gff=$(ls gene_pred/*/$Organism/$Strain/final/final_genes_appended.gff3)
Proteome=$(ls gene_pred/codingquary/$Organism/$Strain/*/final_genes_combined.pep.fasta)
HmmRxLR=analysis/RxLR_effectors/hmmer_RxLR/$Organism/$Strain/*_RxLR_hmmer_headers.txt
echo "$Organism - $Strain"
echo "Number of RxLRs identified by Regex:"
cat $RegexRxLR | sort | uniq | wc -l
echo "Number of RxLRs identified by Hmm:"
cat $HmmRxLR | sort | uniq | wc -l
echo "Number of RxLRs in combined dataset:"
cat $RegexRxLR $HmmRxLR | sort | uniq | wc -l
# echo "Number of RxLRs in both datasets:"
# cat $RegexRxLR $HmmRxLR | sort | uniq -d | wc -l
echo ""
# echo "Extracting RxLRs from datasets"
OutDir=analysis/RxLR_effectors/combined_evidence/$Organism/$Strain
mkdir -p $OutDir
cat $RegexRxLR $HmmRxLR | sort | uniq > $OutDir/"$Strain"_total_RxLR_headers.txt
Gff=$(ls gene_pred/*/$Organism/$Strain/final/final_genes_appended.gff3)
cat $Gff | grep -w -f $OutDir/"$Strain"_total_RxLR_headers.txt > $OutDir/"$Strain"_total_RxLR.gff
echo "Number of genes in the extracted gff file:"
cat $OutDir/"$Strain"_total_RxLR.gff | grep -w 'gene' | wc -l
done
P.cactorum - 11-40
Number of RxLRs identified by Regex:
114
Number of RxLRs identified by Hmm:
130
Number of RxLRs in combined dataset:
154
Number of genes in the extracted gff file:
154
P.cactorum - 12420
Number of RxLRs identified by Regex:
113
Number of RxLRs identified by Hmm:
124
Number of RxLRs in combined dataset:
145
Number of genes in the extracted gff file:
145
P.cactorum - 15_13
Number of RxLRs identified by Regex:
111
Number of RxLRs identified by Hmm:
119
Number of RxLRs in combined dataset:
144
Number of genes in the extracted gff file:
144
P.cactorum - 15_7
Number of RxLRs identified by Regex:
105
Number of RxLRs identified by Hmm:
115
Number of RxLRs in combined dataset:
136
Number of genes in the extracted gff file:
136
P.cactorum - 17-21
Number of RxLRs identified by Regex:
131
Number of RxLRs identified by Hmm:
134
Number of RxLRs in combined dataset:
165
Number of genes in the extracted gff file:
165
P.cactorum - 2003_3
Number of RxLRs identified by Regex:
119
Number of RxLRs identified by Hmm:
128
Number of RxLRs in combined dataset:
152
Number of genes in the extracted gff file:
152
P.cactorum - 4032
Number of RxLRs identified by Regex:
111
Number of RxLRs identified by Hmm:
120
Number of RxLRs in combined dataset:
143
Number of genes in the extracted gff file:
143
P.cactorum - 4040
Number of RxLRs identified by Regex:
118
Number of RxLRs identified by Hmm:
126
Number of RxLRs in combined dataset:
150
Number of genes in the extracted gff file:
150
P.cactorum - 404
Number of RxLRs identified by Regex:
117
Number of RxLRs identified by Hmm:
130
Number of RxLRs in combined dataset:
154
Number of genes in the extracted gff file:
154
P.cactorum - 415
Number of RxLRs identified by Regex:
120
Number of RxLRs identified by Hmm:
127
Number of RxLRs in combined dataset:
155
Number of genes in the extracted gff file:
152
P.cactorum - 416
Number of RxLRs identified by Regex:
121
Number of RxLRs identified by Hmm:
128
Number of RxLRs in combined dataset:
156
Number of genes in the extracted gff file:
152
P.cactorum - 62471
Number of RxLRs identified by Regex:
133
Number of RxLRs identified by Hmm:
139
Number of RxLRs in combined dataset:
167
Number of genes in the extracted gff file:
167
P.cactorum - P295
Number of RxLRs identified by Regex:
122
Number of RxLRs identified by Hmm:
130
Number of RxLRs in combined dataset:
155
Number of genes in the extracted gff file:
155
P.cactorum - P421_v2
Number of RxLRs identified by Regex:
114
Number of RxLRs identified by Hmm:
128
Number of RxLRs in combined dataset:
151
Number of genes in the extracted gff file:
151
P.cactorum - PC13_15
Number of RxLRs identified by Regex:
120
Number of RxLRs identified by Hmm:
131
Number of RxLRs in combined dataset:
154
Number of genes in the extracted gff file:
154
P.cactorum - R36_14
Number of RxLRs identified by Regex:
130
Number of RxLRs identified by Hmm:
141
Number of RxLRs in combined dataset:
168
Number of genes in the extracted gff file:
168
P.idaei - 371
Number of RxLRs identified by Regex:
120
Number of RxLRs identified by Hmm:
100
Number of RxLRs in combined dataset:
143
Number of genes in the extracted gff file:
133
P.idaei - SCRP370
Number of RxLRs identified by Regex:
115
Number of RxLRs identified by Hmm:
108
Number of RxLRs in combined dataset:
144
Number of genes in the extracted gff file:
144
P.idaei - SCRP376
Number of RxLRs identified by Regex:
112
Number of RxLRs identified by Hmm:
106
Number of RxLRs in combined dataset:
137
Number of genes in the extracted gff file:
137
for Secretome in $(ls gene_pred/combined_sigP/*/*/*_all_secreted.fa | grep -e 'P.cactorum' -e 'P.idaei' | grep -v -e '414' -e '10300'); do
ProgDir=/projects/oldhome/armita/git_repos/emr_repos/scripts/phytophthora/pathogen/hmmer
HmmModel=/projects/oldhome/armita/git_repos/emr_repos/scripts/phytophthora/pathogen/hmmer/WY_motif.hmm
Strain=$(echo $Secretome | rev | cut -f2 -d '/' | rev)
Organism=$(echo $Secretome | rev | cut -f3 -d '/' | rev)
OutDir=analysis/RxLR_effectors/hmmer_WY/$Organism/$Strain
mkdir -p $OutDir
HmmResults="$Strain"_Aug_WY_hmmer.txt
hmmsearch -T 0 $HmmModel $Secretome > $OutDir/$HmmResults
echo "$Organism $Strain"
cat $OutDir/$HmmResults | grep 'Initial search space'
cat $OutDir/$HmmResults | grep 'number of targets reported over threshold'
HmmFasta="$Strain"_Aug_WY_hmmer.fa
$ProgDir/hmmer2fasta.pl $OutDir/$HmmResults $Secretome > $OutDir/$HmmFasta
Headers="$Strain"_Aug_WY_hmmer_headers.txt
# cat $OutDir/$HmmFasta | grep '>' | cut -f1 | tr -d '>' | sed -r 's/\.t.*//' | tr -d ' ' | sort | uniq > $OutDir/$Headers
cat $OutDir/$HmmFasta | grep '>' | cut -f1 | tr -d '>' | cut -f1 | sort | uniq > $OutDir/$Headers
# Gff=$(ls gene_pred/*/$Organism/$Strain/final/final_genes_appended_renamed.gff3)
# cat $Gff | grep -w -f $OutDir/$Headers > $OutDir/"$Strain"_Aug_WY_hmmer.gff
# ProgDir=/projects/oldhome/armita/git_repos/emr_repos/tools/seq_tools/feature_annotation
# $ProgDir/gene_list_to_gff.pl $OutDir/$Headers $Gff $HmmModel Name Augustus > $OutDir/"$Strain"_Aug_WY_hmmer.gff
done
A hmm model relating to crinkler domains was used to identify putative crinklers in Augustus gene models. This was done with the following commands:
HmmDir=/projects/oldhome/groups/harrisonlab/project_files/idris/analysis/CRN_effectors/hmmer_models
LFLAK_hmm=$(ls $HmmDir/Pinf_Pram_Psoj_Pcap_LFLAK.hmm)
DWL_hmm=$(ls $HmmDir/Pinf_Pram_Psoj_Pcap_DWL.hmm)
for Proteome in $(ls gene_pred/codingquary/*/*/*/final_genes_combined.pep.fasta | grep -e 'P.cactorum' -e 'P.idaei' | grep -v -e '414' -e '10300'); do
Strain=$(echo $Proteome | rev | cut -f3 -d '/' | rev)
Organism=$(echo $Proteome | rev | cut -f4 -d '/' | rev)
OutDir=analysis/CRN_effectors/hmmer_CRN/$Organism/$Strain
mkdir -p $OutDir
echo "$Organism - $Strain"
# Run hmm searches LFLAK domains
CrinklerProts_LFLAK=$OutDir/"$Strain"_pub_CRN_LFLAK_hmm.txt
hmmsearch -T0 $LFLAK_hmm $Proteome > $CrinklerProts_LFLAK
cat $CrinklerProts_LFLAK | grep 'Initial search space'
cat $CrinklerProts_LFLAK | grep 'number of targets reported over threshold'
ProgDir=/projects/oldhome/armita/git_repos/emr_repos/scripts/phytophthora/pathogen/hmmer
$ProgDir/hmmer2fasta.pl $CrinklerProts_LFLAK $Proteome > $OutDir/"$Strain"_pub_CRN_LFLAK_hmm.fa
# Run hmm searches DWL domains
CrinklerProts_DWL=$OutDir/"$Strain"_pub_CRN_DWL_hmm.txt
hmmsearch -T0 $DWL_hmm $Proteome > $CrinklerProts_DWL
cat $CrinklerProts_DWL | grep 'Initial search space'
cat $CrinklerProts_DWL | grep 'number of targets reported over threshold'
ProgDir=/projects/oldhome/armita/git_repos/emr_repos/scripts/phytophthora/pathogen/hmmer
$ProgDir/hmmer2fasta.pl $CrinklerProts_DWL $Proteome > $OutDir/"$Strain"_pub_CRN_DWL_hmm.fa
# Identify the genes detected in both models
cat $OutDir/"$Strain"_pub_CRN_LFLAK_hmm.fa $OutDir/"$Strain"_pub_CRN_DWL_hmm.fa | grep '>' | cut -f1 | tr -d '>' | sort | uniq -d > $OutDir/"$Strain"_pub_CRN_LFLAK_DWL.txt
cat $OutDir/"$Strain"_pub_CRN_LFLAK_DWL.txt | wc -l
done
P.cactorum - 11-40
Initial search space (Z): 28827 [actual number of targets]
Domain search space (domZ): 109 [number of targets reported over threshold]
Initial search space (Z): 28827 [actual number of targets]
Domain search space (domZ): 88 [number of targets reported over threshold]
59
P.cactorum - 12420
Initial search space (Z): 26231 [actual number of targets]
Domain search space (domZ): 108 [number of targets reported over threshold]
Initial search space (Z): 26231 [actual number of targets]
Domain search space (domZ): 86 [number of targets reported over threshold]
66
P.cactorum - 15_13
Initial search space (Z): 26744 [actual number of targets]
Domain search space (domZ): 114 [number of targets reported over threshold]
Initial search space (Z): 26744 [actual number of targets]
Domain search space (domZ): 99 [number of targets reported over threshold]
72
P.cactorum - 15_7
Initial search space (Z): 26604 [actual number of targets]
Domain search space (domZ): 110 [number of targets reported over threshold]
Initial search space (Z): 26604 [actual number of targets]
Domain search space (domZ): 88 [number of targets reported over threshold]
71
P.cactorum - 17-21
Initial search space (Z): 28577 [actual number of targets]
Domain search space (domZ): 103 [number of targets reported over threshold]
Initial search space (Z): 28577 [actual number of targets]
Domain search space (domZ): 90 [number of targets reported over threshold]
59
P.cactorum - 2003_3
Initial search space (Z): 29786 [actual number of targets]
Domain search space (domZ): 111 [number of targets reported over threshold]
Initial search space (Z): 29786 [actual number of targets]
Domain search space (domZ): 97 [number of targets reported over threshold]
61
P.cactorum - 4032
Initial search space (Z): 26819 [actual number of targets]
Domain search space (domZ): 121 [number of targets reported over threshold]
Initial search space (Z): 26819 [actual number of targets]
Domain search space (domZ): 96 [number of targets reported over threshold]
77
P.cactorum - 4040
Initial search space (Z): 29530 [actual number of targets]
Domain search space (domZ): 122 [number of targets reported over threshold]
Initial search space (Z): 29530 [actual number of targets]
Domain search space (domZ): 93 [number of targets reported over threshold]
73
P.cactorum - 404
Initial search space (Z): 35766 [actual number of targets]
Domain search space (domZ): 96 [number of targets reported over threshold]
Initial search space (Z): 35766 [actual number of targets]
Domain search space (domZ): 84 [number of targets reported over threshold]
51
P.cactorum - 415
Initial search space (Z): 26796 [actual number of targets]
Domain search space (domZ): 123 [number of targets reported over threshold]
Initial search space (Z): 26796 [actual number of targets]
Domain search space (domZ): 94 [number of targets reported over threshold]
75
P.cactorum - 416
Initial search space (Z): 29391 [actual number of targets]
Domain search space (domZ): 130 [number of targets reported over threshold]
Initial search space (Z): 29391 [actual number of targets]
Domain search space (domZ): 107 [number of targets reported over threshold]
78
P.cactorum - 62471
Initial search space (Z): 29596 [actual number of targets]
Domain search space (domZ): 109 [number of targets reported over threshold]
Initial search space (Z): 29596 [actual number of targets]
Domain search space (domZ): 90 [number of targets reported over threshold]
61
P.cactorum - P295
Initial search space (Z): 26327 [actual number of targets]
Domain search space (domZ): 111 [number of targets reported over threshold]
Initial search space (Z): 26327 [actual number of targets]
Domain search space (domZ): 99 [number of targets reported over threshold]
73
P.cactorum - P421_v2
Initial search space (Z): 26124 [actual number of targets]
Domain search space (domZ): 97 [number of targets reported over threshold]
Initial search space (Z): 26124 [actual number of targets]
Domain search space (domZ): 79 [number of targets reported over threshold]
58
P.cactorum - PC13_15
Initial search space (Z): 26607 [actual number of targets]
Domain search space (domZ): 111 [number of targets reported over threshold]
Initial search space (Z): 26607 [actual number of targets]
Domain search space (domZ): 99 [number of targets reported over threshold]
74
P.cactorum - R36_14
Initial search space (Z): 29929 [actual number of targets]
Domain search space (domZ): 126 [number of targets reported over threshold]
Initial search space (Z): 29929 [actual number of targets]
Domain search space (domZ): 108 [number of targets reported over threshold]
81
P.idaei - 371
Initial search space (Z): 25387 [actual number of targets]
Domain search space (domZ): 75 [number of targets reported over threshold]
Initial search space (Z): 25387 [actual number of targets]
Domain search space (domZ): 66 [number of targets reported over threshold]
41
P.idaei - SCRP370
Initial search space (Z): 27882 [actual number of targets]
Domain search space (domZ): 61 [number of targets reported over threshold]
Initial search space (Z): 27882 [actual number of targets]
Domain search space (domZ): 47 [number of targets reported over threshold]
29
P.idaei - SCRP376
Initial search space (Z): 25490 [actual number of targets]
Domain search space (domZ): 57 [number of targets reported over threshold]
Initial search space (Z): 25490 [actual number of targets]
Domain search space (domZ): 48 [number of targets reported over threshold]
26
Extract gff annotations for Crinklers:
for CRNlist in $(ls analysis/CRN_effectors/hmmer_CRN/*/*/*_pub_CRN_LFLAK_DWL.txt | grep -e 'P.cactorum' -e 'P.idaei' | grep -v -e '414' -e '10300'); do
Strain=$(echo $CRNlist | rev | cut -f2 -d '/' | rev)
Organism=$(echo $CRNlist | rev | cut -f3 -d '/' | rev)
OutName=$(echo $CRNlist | sed 's/.txt/.gff/g')
echo "$Organism - $Strain"
Gff=$(ls gene_pred/*/$Organism/$Strain/final/final_genes_appended.gff3)
cat $CRNlist | sed -r 's/\.t.$//g' > tmp.txt
cat $Gff | grep -w -f tmp.txt > $OutName
rm tmp.txt
done
Required programs:
- SigP
- Phobius
- biopython
Proteins that were predicted to contain signal peptides were identified using the following commands:
for Proteome in $(ls gene_pred/ORF_finder/P.*/*/*.aa_cat.fa | grep -e 'P.cactorum' -e 'P.idaei' | grep -v -e '414' -e '10300' | grep -e '415' -e '416'); do
SplitfileDir=/projects/oldhome/armita/git_repos/emr_repos/tools/seq_tools/feature_annotation/signal_peptides
ProgDir=/projects/oldhome/armita/git_repos/emr_repos/tools/seq_tools/feature_annotation/signal_peptides
Strain=$(echo $Proteome | rev | cut -f2 -d '/' | rev)
Organism=$(echo $Proteome | rev | cut -f3 -d '/' | rev)
SplitDir=gene_pred/ORF_split/$Organism/$Strain
mkdir -p $SplitDir
BaseName="$Organism""_$Strain"_ORF_preds
$SplitfileDir/splitfile_500.py --inp_fasta $Proteome --out_dir $SplitDir --out_base $BaseName
for File in $(ls $SplitDir/*_ORF_preds_*); do
Jobs=$(qstat | grep 'pred_sigP' | grep 'qw' | wc -l)
while [ $Jobs -gt 6 ]; do
sleep 1
printf "."
Jobs=$(qstat | grep 'pred_sigP' | grep 'qw' | wc -l)
done
printf "\n"
echo $File
qsub $ProgDir/pred_sigP.sh $File
# qsub $ProgDir/pred_sigP.sh $File signalp-3.0
qsub $ProgDir/pred_sigP.sh $File signalp-4.1
done
done
The batch files of predicted secreted proteins needed to be combined into a single file for each strain. This was done with the following commands:
for SplitDir in $(ls -d gene_pred/ORF_split/*/* | grep -e 'P.cactorum' -e 'P.idaei' | grep -v -e '414' -e '10300' | grep -e '415' -e '416'); do
Strain=$(echo $SplitDir | cut -d '/' -f4)
Organism=$(echo $SplitDir | cut -d '/' -f3)
echo "$Organism - $Strain"
for SigpDir in $(ls -d gene_pred/ORF_sig* | grep -v '_signalp-3.0' | cut -f2 -d'/'); do
InStringAA=''
InStringNeg=''
InStringTab=''
InStringTxt=''
for GRP in $(ls -l $SplitDir/*_ORF_*.fa | rev | cut -d '_' -f1 | rev | sort -n); do
InStringAA="$InStringAA gene_pred/$SigpDir/$Organism/$Strain/split/"$Organism"_"$Strain"_ORF_preds_$GRP""_sp.aa";
InStringNeg="$InStringNeg gene_pred/$SigpDir/$Organism/$Strain/split/"$Organism"_"$Strain"_ORF_preds_$GRP""_sp_neg.aa";
InStringTab="$InStringTab gene_pred/$SigpDir/$Organism/$Strain/split/"$Organism"_"$Strain"_ORF_preds_$GRP""_sp.tab";
InStringTxt="$InStringTxt gene_pred/$SigpDir/$Organism/$Strain/split/"$Organism"_"$Strain"_ORF_preds_$GRP""_sp.txt";
done
cat $InStringAA > gene_pred/$SigpDir/$Organism/$Strain/"$Strain"_ORF_sp.aa
cat $InStringNeg > gene_pred/$SigpDir/$Organism/$Strain/"$Strain"_ORF_neg_sp.aa
tail -n +2 -q $InStringTab > gene_pred/$SigpDir/$Organism/$Strain/"$Strain"_ORF_sp.tab
cat $InStringTxt > gene_pred/$SigpDir/$Organism/$Strain/"$Strain"_ORF_sp.txt
done
done
Secreted proteins were also predicted using Phobius
for Proteome in $(ls gene_pred/ORF_finder/P.*/*/*.aa_cat.fa | grep -e 'P.cactorum' -e 'P.idaei' | grep -v -e '414' -e '10300'); do
Strain=$(echo $Proteome | rev | cut -f2 -d '/' | rev)
Organism=$(echo $Proteome | rev | cut -f3 -d '/' | rev)
echo "$Organism - $Strain"
OutDir=analysis/phobius/$Organism/$Strain
mkdir -p $OutDir
phobius.pl $Proteome > $OutDir/"$Strain"_phobius_ORF.txt
cat $OutDir/"$Strain"_phobius_ORF.txt | grep -B1 'SIGNAL' | grep 'ID' | sed s'/ID.*g/g/g' > $OutDir/"$Strain"_phobius_headers_ORF.txt
done
Secreted proteins from different sources were combined into a single file:
for Proteome in $(ls gene_pred/ORF_finder/P.*/*/*.aa_cat.fa | grep -e 'P.cactorum' -e 'P.idaei' | grep -v -e '414' -e '10300'); do
Strain=$(echo $Proteome | rev | cut -f2 -d '/' | rev)
Organism=$(echo $Proteome | rev | cut -f3 -d '/' | rev)
echo "$Organism - $Strain"
OutDir=gene_pred/combined_sigP_ORF/$Organism/$Strain
mkdir -p $OutDir
echo "The following number of sequences were predicted as secreted:"
cat gene_pred/ORF_sig*/$Organism/$Strain/*_ORF_sp.aa > $OutDir/"$Strain"_all_secreted.fa
cat $OutDir/"$Strain"_all_secreted.fa | grep '>' | tr -d '>' | tr -d ' ' | sed "s/HMM_score\t/HMM_score=\t/g" > $OutDir/"$Strain"_all_secreted_headers.txt
cat $OutDir/"$Strain"_all_secreted_headers.txt | wc -l
echo "This represented the following number of unique genes:"
cat gene_pred/ORF_sig*/$Organism/$Strain/*_ORF_sp.aa | grep '>' | cut -f1 | tr -d ' >' | sort -g | uniq > $OutDir/"$Strain"_secreted.txt
ProgDir=/projects/oldhome/armita/git_repos/emr_repos/tools/gene_prediction/ORF_finder
$ProgDir/extract_from_fasta.py --fasta $Proteome --headers $OutDir/"$Strain"_secreted.txt > $OutDir/"$Strain"_secreted.fa
cat $OutDir/"$Strain"_secreted.fa | grep '>' | wc -l
done
P.cactorum - 11-40
The following number of sequences were predicted as secreted:
17518
This represented the following number of unique genes:
17491
P.cactorum - 12420
The following number of sequences were predicted as secreted:
17579
This represented the following number of unique genes:
17525
P.cactorum - 15_13
The following number of sequences were predicted as secreted:
17687
This represented the following number of unique genes:
17660
P.cactorum - 15_7
The following number of sequences were predicted as secreted:
21929
This represented the following number of unique genes:
19425
P.cactorum - 17-21
The following number of sequences were predicted as secreted:
17481
This represented the following number of unique genes:
17469
P.cactorum - 2003_3
The following number of sequences were predicted as secreted:
17630
This represented the following number of unique genes:
17630
P.cactorum - 4032
The following number of sequences were predicted as secreted:
22067
This represented the following number of unique genes:
19641
P.cactorum - 4040
The following number of sequences were predicted as secreted:
21875
This represented the following number of unique genes:
19427
P.cactorum - 404
The following number of sequences were predicted as secreted:
43020
This represented the following number of unique genes:
29642
P.cactorum - 415
The following number of sequences were predicted as secreted:
41447
This represented the following number of unique genes:
27389
P.cactorum - 416
The following number of sequences were predicted as secreted:
42503
This represented the following number of unique genes:
27683
P.cactorum - 62471
The following number of sequences were predicted as secreted:
17830
This represented the following number of unique genes:
17816
P.cactorum - P295
The following number of sequences were predicted as secreted:
21889
This represented the following number of unique genes:
19461
P.cactorum - P421_v2
The following number of sequences were predicted as secreted:
17434
This represented the following number of unique genes:
17434
P.cactorum - PC13_15
The following number of sequences were predicted as secreted:
17837
This represented the following number of unique genes:
17837
P.cactorum - R36_14
The following number of sequences were predicted as secreted:
17959
This represented the following number of unique genes:
17959
P.idaei - 371
The following number of sequences were predicted as secreted:
37207
This represented the following number of unique genes:
25622
P.idaei - SCRP370
The following number of sequences were predicted as secreted:
17254
This represented the following number of unique genes:
17254
P.idaei - SCRP376
The following number of sequences were predicted as secreted:
17208
This represented the following number of unique genes:
17208
Names of ORFs containing signal peptides were extracted from fasta files. This included information on the position and hmm score of RxLRs.
FastaFile=gene_pred/ORF_sigP/$Organism/$Strain/"$Strain"_ORF_sp.aa
SigP_headers=gene_pred/ORF_sigP/$Organism/$Strain/"$Strain"_ORF_sp_names.txt
cat $FastaFile | grep '>' | sed -r 's/>//g' | sed -r 's/\s+/\t/g'| sed 's/=\t/=/g' | sed 's/--//g' > $SigP_headers
Due to the nature of predicting ORFs, some features overlapped with one another. A single ORF was selected from each set of overlapped ORFs. This was was selected on the basis of its SignalP Hmm score. Biopython was used to identify overlaps and identify the ORF with the best signalP score.
Due to the nature of predicting ORFs, some features overlapped with one another. A single ORF was selected from each set of overlapped ORFs. This was was selected on the basis of its SignalP Hmm score. Biopython was used to identify overlaps and identify the ORF with the best signalP score.
for ORF_Gff in $(ls gene_pred/ORF_finder/*/*/*_ORF.gff3 | grep -v -e 'atg' | grep -e 'P.cactorum' -e 'P.idaei' | grep -v -e '414' -e '10300'); do
Organism=$(echo $ORF_Gff | rev | cut -d '/' -f3 | rev) ;
Strain=$(echo $ORF_Gff | rev | cut -d '/' -f2 | rev);
OutDir=$(ls -d gene_pred/combined_sigP_ORF/$Organism/$Strain)
echo "$Organism - $Strain"
# SigP_fasta=$(ls $OutDir/"$Strain"_all_secreted.fa)
SigP_headers=$(ls $OutDir/"$Strain"_all_secreted_headers.txt)
ORF_fasta=$(ls gene_pred/ORF_finder/$Organism/$Strain/"$Strain".aa_cat.fa)
SigP_Gff=$OutDir/"$Strain"_all_secreted_unmerged.gff
SigP_Merged_Gff=$OutDir/"$Strain"_all_secreted_merged.gff
SigP_Merged_txt=$OutDir/"$Strain"_all_secreted_merged.txt
SigP_Merged_AA=$OutDir/"$Strain"_all_secreted_merged.aa
ProgDir=/projects/oldhome/armita/git_repos/emr_repos/tools/gene_prediction/ORF_finder
$ProgDir/extract_gff_for_sigP_hits.pl $SigP_headers $ORF_Gff SigP Name > $SigP_Gff
ProgDir=~/git_repos/emr_repos/scripts/phytophthora/pathogen/merge_gff
$ProgDir/make_gff_database.py --inp $SigP_Gff --db sigP_ORF.db
ProgDir=/projects/oldhome/armita/git_repos/emr_repos/tools/gene_prediction/ORF_finder
$ProgDir/merge_sigP_ORFs.py --inp sigP_ORF.db --id sigP_ORF --out sigP_ORF_merged.db --gff > $SigP_Merged_Gff
cat $SigP_Merged_Gff | grep 'transcript' | rev | cut -f1 -d'=' | rev > $SigP_Merged_txt
# $ProgDir/extract_from_fasta.py --fasta $SigP_fasta --headers $SigP_Merged_txt > $SigP_Merged_AA
$ProgDir/extract_from_fasta.py --fasta $ORF_fasta --headers $SigP_Merged_txt > $SigP_Merged_AA
done
The regular expression R.LR.{,40}[ED][ED][KR] has previously been used to identify RxLR effectors. The addition of an EER motif is significant as it has been shown as required for host uptake of the protein.
The RxLR_EER_regex_finder.py script was used to search for this regular expression and annotate the EER domain where present.
for Secretome in $(ls gene_pred/combined_sigP_ORF/*/*/*_all_secreted.fa | grep -v -e 'atg' | grep -e 'P.cactorum' -e 'P.idaei' | grep -v -e '414' -e '10300'); do
ProgDir=~/git_repos/emr_repos/tools/pathogen/RxLR_effectors
Strain=$(echo $Secretome | rev | cut -d '/' -f2 | rev);
Organism=$(echo $Secretome | rev | cut -d '/' -f3 | rev) ;
OutDir=analysis/RxLR_effectors/RxLR_EER_regex_finder/"$Organism"/"$Strain";
mkdir -p $OutDir;
printf "\nstrain: $Strain\tspecies: $Organism\n";
printf "the number of SigP gene is:\t";
cat $Secretome | grep '>' | cut -f1 | tr -d ' ' | sort | uniq | wc -l
printf "the number of SigP-RxLR genes are:\t";
$ProgDir/RxLR_EER_regex_finder.py $Secretome > $OutDir/"$Strain"_ORF_RxLR_EER_regex_unmerged.fa;
cat $OutDir/"$Strain"_ORF_RxLR_EER_regex_unmerged.fa | grep '>' | cut -f1 | tr -d '>' | sed -r 's/\.t.*//' | tr -d ' ' > $OutDir/"$Strain"_ORF_RxLR_regex_unmerged.txt
cat $OutDir/"$Strain"_ORF_RxLR_regex_unmerged.txt | tr -d ' ' | sort | uniq | wc -l
printf "the number of SigP-RxLR-EER genes are:\t";
cat $OutDir/"$Strain"_ORF_RxLR_EER_regex_unmerged.fa | grep '>' | grep 'EER_motif_start' | cut -f1 | tr -d '>' | sed -r 's/\.t.*//' | tr -d ' '> $OutDir/"$Strain"_ORF_RxLR_EER_regex_unmerged.txt
cat $OutDir/"$Strain"_ORF_RxLR_EER_regex_unmerged.txt | tr -d ' ' | sort | uniq | wc -l
ProgDir=/projects/oldhome/armita/git_repos/emr_repos/tools/seq_tools/feature_annotation
# $ProgDir/gene_list_to_gff.pl $OutDir/"$Strain"_ORF_RxLR_regex_unmerged.txt $SigP_Merged_Gff RxLR_EER_regex_finder.py Name Augustus > $OutDir/"$Strain"_ORF_RxLR_regex_unmerged.gff
SigP_Gff=gene_pred/combined_sigP_ORF/$Organism/$Strain/"$Strain"_all_secreted_unmerged.gff
ORF_fasta=$(ls gene_pred/ORF_finder/$Organism/$Strain/"$Strain".aa_cat.fa)
ProgDir=/projects/oldhome/armita/git_repos/emr_repos/tools/seq_tools/feature_annotation
$ProgDir/gene_list_to_gff.pl $OutDir/"$Strain"_ORF_RxLR_regex_unmerged.txt $SigP_Gff RxLR_regex_finder.py Name Augustus > $OutDir/"$Strain"_ORF_RxLR_regex_unmerged.gff
$ProgDir/gene_list_to_gff.pl $OutDir/"$Strain"_ORF_RxLR_EER_regex_unmerged.txt $SigP_Gff RxLR_EER_regex_finder.py Name Augustus > $OutDir/"$Strain"_ORF_RxLR_EER_regex_unmerged.gff
# ProgDir=/projects/oldhome/armita/git_repos/emr_repos/tools/seq_tools/feature_annotation
# $ProgDir/gene_list_to_gff.pl $OutDir/"$Strain"_ORF_RxLR_EER_regex_unmerged.txt $SigP_Merged_Gff RxLR_EER_regex_finder.py Name Augustus > $OutDir/"$Strain"_ORF_RxLR_EER_regex.gff
RxLR_Merged_Gff=$OutDir/"$Strain"_ORF_RxLR_regex_merged.gff
RxLR_Merged_txt=$OutDir/"$Strain"_ORF_RxLR_regex_merged.txt
RxLR_Merged_AA=$OutDir/"$Strain"_ORF_RxLR_regex_merged.aa
RxLR_EER_Merged_Gff=$OutDir/"$Strain"_ORF_RxLR_EER_regex_merged.gff
RxLR_EER_Merged_txt=$OutDir/"$Strain"_ORF_RxLR_EER_regex_merged.txt
RxLR_EER_Merged_AA=$OutDir/"$Strain"_ORF_RxLR_EER_regex_merged.aa
ProgDir=~/git_repos/emr_repos/scripts/phytophthora/pathogen/merge_gff
$ProgDir/make_gff_database.py --inp $OutDir/"$Strain"_ORF_RxLR_regex_unmerged.gff --db sigP_ORF_RxLR.db
$ProgDir/make_gff_database.py --inp $OutDir/"$Strain"_ORF_RxLR_EER_regex_unmerged.gff --db sigP_ORF_RxLR_EER.db
ProgDir=/projects/oldhome/armita/git_repos/emr_repos/tools/gene_prediction/ORF_finder
$ProgDir/merge_sigP_ORFs.py --inp sigP_ORF_RxLR.db --id sigP_ORF_RxLR --out sigP_ORF_RxLR_merged.db --gff > $RxLR_Merged_Gff
$ProgDir/merge_sigP_ORFs.py --inp sigP_ORF_RxLR_EER.db --id sigP_ORF_RxLR_EER --out sigP_ORF_RxLR_EER_merged.db --gff > $RxLR_EER_Merged_Gff
cat $RxLR_Merged_Gff | grep 'transcript' | rev | cut -f1 -d '=' | rev > $RxLR_Merged_txt
cat $RxLR_EER_Merged_Gff | grep 'transcript' | rev | cut -f1 -d '=' | rev > $RxLR_EER_Merged_txt
$ProgDir/extract_from_fasta.py --fasta $ORF_fasta --headers $RxLR_Merged_txt > $RxLR_Merged_AA
$ProgDir/extract_from_fasta.py --fasta $ORF_fasta --headers $RxLR_EER_Merged_txt > $RxLR_EER_Merged_AA
printf "Merged RxLR regex proteins:\t"
cat $RxLR_Merged_AA | grep '>' | wc -l
printf "Merged RxLR-EER regex proteins:\t"
cat $RxLR_EER_Merged_AA | grep '>' | wc -l
printf "\n"
done
strain: 11-40 species: P.cactorum
the number of SigP gene is: 17491
the number of SigP-RxLR genes are: 1007
the number of SigP-RxLR-EER genes are: 178
Merged RxLR regex proteins: 869
Merged RxLR-EER regex proteins: 160
strain: 12420 species: P.cactorum
the number of SigP gene is: 17525
the number of SigP-RxLR genes are: 1034
the number of SigP-RxLR-EER genes are: 175
Merged RxLR regex proteins: 889
Merged RxLR-EER regex proteins: 158
strain: 15_13 species: P.cactorum
the number of SigP gene is: 17660
the number of SigP-RxLR genes are: 1055
the number of SigP-RxLR-EER genes are: 177
Merged RxLR regex proteins: 912
Merged RxLR-EER regex proteins: 159
strain: 15_7 species: P.cactorum
the number of SigP gene is: 19425
the number of SigP-RxLR genes are: 1151
the number of SigP-RxLR-EER genes are: 175
Merged RxLR regex proteins: 987
Merged RxLR-EER regex proteins: 158
strain: 17-21 species: P.cactorum
the number of SigP gene is: 17469
the number of SigP-RxLR genes are: 1030
the number of SigP-RxLR-EER genes are: 176
Merged RxLR regex proteins: 890
Merged RxLR-EER regex proteins: 162
strain: 2003_3 species: P.cactorum
the number of SigP gene is: 17630
the number of SigP-RxLR genes are: 1048
the number of SigP-RxLR-EER genes are: 174
Merged RxLR regex proteins: 908
Merged RxLR-EER regex proteins: 157
strain: 4032 species: P.cactorum
the number of SigP gene is: 19641
the number of SigP-RxLR genes are: 1152
the number of SigP-RxLR-EER genes are: 182
Merged RxLR regex proteins: 989
Merged RxLR-EER regex proteins: 163
strain: 4040 species: P.cactorum
the number of SigP gene is: 19427
the number of SigP-RxLR genes are: 1136
the number of SigP-RxLR-EER genes are: 177
Merged RxLR regex proteins: 969
Merged RxLR-EER regex proteins: 158
strain: 404 species: P.cactorum
the number of SigP gene is: 29642
the number of SigP-RxLR genes are: 1633
the number of SigP-RxLR-EER genes are: 192
Merged RxLR regex proteins: 1371
Merged RxLR-EER regex proteins: 168
strain: 415 species: P.cactorum
the number of SigP gene is: 28547
the number of SigP-RxLR genes are: 1692
the number of SigP-RxLR-EER genes are: 214
Merged RxLR regex proteins: 1372
Merged RxLR-EER regex proteins: 184
strain: 416 species: P.cactorum
the number of SigP gene is: 29221
the number of SigP-RxLR genes are: 1700
the number of SigP-RxLR-EER genes are: 211
Merged RxLR regex proteins: 1343
Merged RxLR-EER regex proteins: 179
strain: 62471 species: P.cactorum
the number of SigP gene is: 17816
the number of SigP-RxLR genes are: 1069
the number of SigP-RxLR-EER genes are: 192
Merged RxLR regex proteins: 927
Merged RxLR-EER regex proteins: 175
strain: P295 species: P.cactorum
the number of SigP gene is: 19461
the number of SigP-RxLR genes are: 1144
the number of SigP-RxLR-EER genes are: 193
Merged RxLR regex proteins: 982
Merged RxLR-EER regex proteins: 174
strain: P421_v2 species: P.cactorum
the number of SigP gene is: 17434
the number of SigP-RxLR genes are: 1014
the number of SigP-RxLR-EER genes are: 180
Merged RxLR regex proteins: 872
Merged RxLR-EER regex proteins: 163
strain: PC13_15 species: P.cactorum
the number of SigP gene is: 17837
the number of SigP-RxLR genes are: 1056
the number of SigP-RxLR-EER genes are: 180
Merged RxLR regex proteins: 916
Merged RxLR-EER regex proteins: 162
strain: R36_14 species: P.cactorum
the number of SigP gene is: 17959
the number of SigP-RxLR genes are: 1079
the number of SigP-RxLR-EER genes are: 185
Merged RxLR regex proteins: 934
Merged RxLR-EER regex proteins: 169
strain: 371 species: P.idaei
the number of SigP gene is: 25622
the number of SigP-RxLR genes are: 1529
the number of SigP-RxLR-EER genes are: 222
Merged RxLR regex proteins: 1254
Merged RxLR-EER regex proteins: 189
strain: SCRP370 species: P.idaei
the number of SigP gene is: 17254
the number of SigP-RxLR genes are: 1044
the number of SigP-RxLR-EER genes are: 194
Merged RxLR regex proteins: 887
Merged RxLR-EER regex proteins: 172
strain: SCRP376 species: P.idaei
the number of SigP gene is: 17208
the number of SigP-RxLR genes are: 1039
the number of SigP-RxLR-EER genes are: 193
Merged RxLR regex proteins: 880
Merged RxLR-EER regex proteins: 171
Hmm models for the WY domain contained in many RxLRs were used to search ORFs predicted with atg.pl. These were run with the following commands:
for Secretome in $(ls gene_pred/combined_sigP_ORF/*/*/*_all_secreted.fa | grep -v -e 'atg' | grep -e 'P.cactorum' -e 'P.idaei' | grep -v -e '414' -e '10300'); do
ProgDir=/projects/oldhome/armita/git_repos/emr_repos/scripts/phytophthora/pathogen/hmmer
HmmModel=/projects/oldhome/armita/git_repos/emr_repos/scripts/phytophthora/pathogen/hmmer/WY_motif.hmm
Strain=$(echo $Secretome | rev | cut -f2 -d '/' | rev)
Organism=$(echo $Secretome | rev | cut -f3 -d '/' | rev)
OutDir=analysis/RxLR_effectors/hmmer_WY/$Organism/$Strain
mkdir -p $OutDir
HmmResults="$Strain"_ORF_WY_hmmer.txt
hmmsearch -T 0 $HmmModel $Secretome > $OutDir/$HmmResults
echo "$Organism $Strain"
cat $OutDir/$HmmResults | grep 'Initial search space'
cat $OutDir/$HmmResults | grep 'number of targets reported over threshold'
HmmFasta="$Strain"_ORF_WY_hmmer.fa
$ProgDir/hmmer2fasta.pl $OutDir/$HmmResults $Secretome > $OutDir/$HmmFasta
Headers="$Strain"_ORF_WY_hmmer_headers.txt
cat $OutDir/$HmmFasta | grep '>' | cut -f1 | tr -d '>' | sed -r 's/\.t.*//' | tr -d ' ' > $OutDir/$Headers
SigP_Merged_Gff=$(ls gene_pred/combined_sigP_ORF/$Organism/$Strain/${Strain}_all_secreted_merged.gff)
ProgDir=/projects/oldhome/armita/git_repos/emr_repos/tools/seq_tools/feature_annotation
$ProgDir/gene_list_to_gff.pl $OutDir/$Headers $SigP_Merged_Gff $HmmModel Name Augustus > $OutDir/"$Strain"_ORF_WY_hmmer.gff
done
P.cactorum 11-40
Initial search space (Z): 17518 [actual number of targets]
Domain search space (domZ): 101 [number of targets reported over threshold]
P.cactorum 12420
Initial search space (Z): 17579 [actual number of targets]
Domain search space (domZ): 101 [number of targets reported over threshold]
P.cactorum 15_13
Initial search space (Z): 17687 [actual number of targets]
Domain search space (domZ): 100 [number of targets reported over threshold]
P.cactorum 15_7
Initial search space (Z): 21929 [actual number of targets]
Domain search space (domZ): 112 [number of targets reported over threshold]
P.cactorum 17-21
Initial search space (Z): 17481 [actual number of targets]
Domain search space (domZ): 100 [number of targets reported over threshold]
P.cactorum 2003_3
Initial search space (Z): 17630 [actual number of targets]
Domain search space (domZ): 102 [number of targets reported over threshold]
P.cactorum 4032
Initial search space (Z): 22067 [actual number of targets]
Domain search space (domZ): 112 [number of targets reported over threshold]
P.cactorum 4040
Initial search space (Z): 21875 [actual number of targets]
Domain search space (domZ): 109 [number of targets reported over threshold]
P.cactorum 404
Initial search space (Z): 43020 [actual number of targets]
Domain search space (domZ): 208 [number of targets reported over threshold]
P.cactorum 415
Initial search space (Z): 41447 [actual number of targets]
Domain search space (domZ): 234 [number of targets reported over threshold]
P.cactorum 416
Initial search space (Z): 42503 [actual number of targets]
Domain search space (domZ): 236 [number of targets reported over threshold]
P.cactorum 62471
Initial search space (Z): 17830 [actual number of targets]
Domain search space (domZ): 111 [number of targets reported over threshold]
P.cactorum P295
Initial search space (Z): 21889 [actual number of targets]
Domain search space (domZ): 124 [number of targets reported over threshold]
P.cactorum P421_v2
Initial search space (Z): 17434 [actual number of targets]
Domain search space (domZ): 103 [number of targets reported over threshold]
P.cactorum PC13_15
Initial search space (Z): 17837 [actual number of targets]
Domain search space (domZ): 104 [number of targets reported over threshold]
P.cactorum R36_14
Initial search space (Z): 17959 [actual number of targets]
Domain search space (domZ): 107 [number of targets reported over threshold]
P.idaei 371
Initial search space (Z): 37207 [actual number of targets]
Domain search space (domZ): 209 [number of targets reported over threshold]
P.idaei SCRP370
Initial search space (Z): 17254 [actual number of targets]
Domain search space (domZ): 106 [number of targets reported over threshold]
P.idaei SCRP376
Initial search space (Z): 17208 [actual number of targets]
Domain search space (domZ): 102 [number of targets reported over threshold]
for Secretome in $(ls gene_pred/combined_sigP_ORF/*/*/*_all_secreted.fa | grep -e 'P.cactorum' -e 'P.idaei' | grep -v -e '414' -e '10300'); do
ProgDir=/projects/oldhome/armita/git_repos/emr_repos/scripts/phytophthora/pathogen/hmmer
HmmModel=/projects/oldhome/armita/git_repos/emr_repos/SI_Whisson_et_al_2007/cropped.hmm
Strain=$(echo $Secretome | rev | cut -f2 -d '/' | rev)
Organism=$(echo $Secretome | rev | cut -f3 -d '/' | rev)
OutDir=analysis/RxLR_effectors/hmmer_RxLR/$Organism/$Strain
mkdir -p $OutDir
HmmResults="$Strain"_ORF_RxLR_hmmer_unmerged.txt
hmmsearch -T 0 $HmmModel $Secretome > $OutDir/$HmmResults
echo "$Organism $Strain"
cat $OutDir/$HmmResults | grep 'Initial search space'
cat $OutDir/$HmmResults | grep 'number of targets reported over threshold'
HmmFasta="$Strain"_ORF_RxLR_hmmer.fa
$ProgDir/hmmer2fasta.pl $OutDir/$HmmResults $Secretome > $OutDir/$HmmFasta
Headers="$Strain"_ORF_RxLR_hmmer_headers_unmerged.txt
cat $OutDir/$HmmFasta | grep '>' | cut -f1 | tr -d '>' | sed -r 's/\.t.*//' | tr -d ' ' > $OutDir/$Headers
SigP_Gff=gene_pred/combined_sigP_ORF/$Organism/$Strain/"$Strain"_all_secreted_unmerged.gff
ProgDir=/projects/oldhome/armita/git_repos/emr_repos/tools/seq_tools/feature_annotation
$ProgDir/gene_list_to_gff.pl $OutDir/$Headers $SigP_Gff $HmmModel Name Augustus > $OutDir/"$Strain"_ORF_RxLR_hmmer_unmerged.gff3
RxLR_Merged_Gff=$OutDir/"$Strain"_ORF_RxLR_hmm_merged.gff
RxLR_Merged_txt=$OutDir/"$Strain"_ORF_RxLR_hmm_merged.txt
RxLR_Merged_AA=$OutDir/"$Strain"_ORF_RxLR_hmm_merged.aa
ProgDir=~/git_repos/emr_repos/scripts/phytophthora/pathogen/merge_gff
$ProgDir/make_gff_database.py --inp $OutDir/"$Strain"_ORF_RxLR_hmmer_unmerged.gff3 --db sigP_ORF_RxLR_hmm.db
ProgDir=/projects/oldhome/armita/git_repos/emr_repos/tools/gene_prediction/ORF_finder
$ProgDir/merge_sigP_ORFs.py --inp sigP_ORF_RxLR_hmm.db --id sigP_ORF_RxLR_hmm --out sigP_ORF_RxLR_hmm_merged.db --gff > $RxLR_Merged_Gff
cat $RxLR_Merged_Gff | grep 'transcript' | rev | cut -f1 -d '=' | rev > $RxLR_Merged_txt
ORF_fasta=$(ls gene_pred/ORF_finder/$Organism/$Strain/"$Strain".aa_cat.fa)
$ProgDir/extract_from_fasta.py --fasta $ORF_fasta --headers $RxLR_Merged_txt > $RxLR_Merged_AA
printf "Merged RxLR-EER Hmm proteins:\t"
cat $RxLR_Merged_AA | grep '>' | wc -l
done
P.cactorum 11-40
Initial search space (Z): 17518 [actual number of targets]
Domain search space (domZ): 137 [number of targets reported over threshold]
Merged RxLR-EER Hmm proteins: 124
P.cactorum 12420
Initial search space (Z): 17579 [actual number of targets]
Domain search space (domZ): 137 [number of targets reported over threshold]
Merged RxLR-EER Hmm proteins: 124
P.cactorum 15_13
Initial search space (Z): 17687 [actual number of targets]
Domain search space (domZ): 135 [number of targets reported over threshold]
Merged RxLR-EER Hmm proteins: 122
P.cactorum 15_7
Initial search space (Z): 21929 [actual number of targets]
Domain search space (domZ): 154 [number of targets reported over threshold]
Merged RxLR-EER Hmm proteins: 124
P.cactorum 17-21
Initial search space (Z): 17481 [actual number of targets]
Domain search space (domZ): 134 [number of targets reported over threshold]
Merged RxLR-EER Hmm proteins: 124
P.cactorum 2003_3
Initial search space (Z): 17630 [actual number of targets]
Domain search space (domZ): 135 [number of targets reported over threshold]
Merged RxLR-EER Hmm proteins: 122
P.cactorum 4032
Initial search space (Z): 22067 [actual number of targets]
Domain search space (domZ): 153 [number of targets reported over threshold]
Merged RxLR-EER Hmm proteins: 128
P.cactorum 4040
Initial search space (Z): 21875 [actual number of targets]
Domain search space (domZ): 153 [number of targets reported over threshold]
Merged RxLR-EER Hmm proteins: 124
P.cactorum 404
Initial search space (Z): 43020 [actual number of targets]
Domain search space (domZ): 275 [number of targets reported over threshold]
Merged RxLR-EER Hmm proteins: 127
P.cactorum 415
Initial search space (Z): 41447 [actual number of targets]
Domain search space (domZ): 306 [number of targets reported over threshold]
Merged RxLR-EER Hmm proteins: 141
P.cactorum 416
Initial search space (Z): 42503 [actual number of targets]
Domain search space (domZ): 310 [number of targets reported over threshold]
Merged RxLR-EER Hmm proteins: 143
P.cactorum 62471
Initial search space (Z): 17830 [actual number of targets]
Domain search space (domZ): 146 [number of targets reported over threshold]
Merged RxLR-EER Hmm proteins: 134
P.cactorum P295
Initial search space (Z): 21889 [actual number of targets]
Domain search space (domZ): 167 [number of targets reported over threshold]
Merged RxLR-EER Hmm proteins: 134
P.cactorum P421_v2
Initial search space (Z): 17434 [actual number of targets]
Domain search space (domZ): 139 [number of targets reported over threshold]
Merged RxLR-EER Hmm proteins: 126
P.cactorum PC13_15
Initial search space (Z): 17837 [actual number of targets]
Domain search space (domZ): 141 [number of targets reported over threshold]
Merged RxLR-EER Hmm proteins: 127
P.cactorum R36_14
Initial search space (Z): 17959 [actual number of targets]
Domain search space (domZ): 144 [number of targets reported over threshold]
Merged RxLR-EER Hmm proteins: 133
P.idaei 371
Initial search space (Z): 37207 [actual number of targets]
Domain search space (domZ): 279 [number of targets reported over threshold]
Merged RxLR-EER Hmm proteins: 132
P.idaei SCRP370
Initial search space (Z): 17254 [actual number of targets]
Domain search space (domZ): 138 [number of targets reported over threshold]
Merged RxLR-EER Hmm proteins: 128
P.idaei SCRP376
Initial search space (Z): 17208 [actual number of targets]
Domain search space (domZ): 136 [number of targets reported over threshold]
Merged RxLR-EER Hmm proteins: 126
The total ORF RxLRs are
for RegexRxLREER in $(ls analysis/RxLR_effectors/RxLR_EER_regex_finder/*/*/*_ORF_RxLR_EER_regex_merged.txt | grep -e 'P.cactorum' -e 'P.idaei' | grep -v -e '414' -e '10300'); do
Organism=$(echo $RegexRxLREER | rev | cut -d '/' -f3 | rev)
Strain=$(echo $RegexRxLREER | rev | cut -d '/' -f2 | rev)
Gff=$(ls gene_pred/ORF_finder/$Organism/$Strain/"$Strain"_ORF.gff3)
Proteome=$(ls gene_pred/ORF_finder/$Organism/$Strain/"$Strain".aa_cat.fa)
# RegexRxLRfpkm=$(ls analysis/RxLR_effectors/RxLR_EER_regex_finder/$Organism/$Strain/"$Strain"_ORF_RxLR_regex_merged_fpkm_5_headers_renamed.txt)
HmmRxLR=$(ls analysis/RxLR_effectors/hmmer_RxLR/$Organism/$Strain/"$Strain"_ORF_RxLR_hmm_merged.txt)
# echo "$Organism - $Strain"
# echo "Number of RxLR EERs identified by Regex:"
Regex=$(cat $RegexRxLREER | sort | uniq | wc -l)
# echo "Number of RxLRs identified by Regex with fpkm > 5"
# cat $RegexRxLRfpkm | sort | uniq | wc -l
# echo "Number of RxLRs identified by Hmm:"
Hmm=$(cat $HmmRxLR | sort | uniq | wc -l)
# echo "Number of RxLRs in combined dataset:"
# cat $RegexRxLREER $HmmRxLR $RegexRxLRfpkm | sort | uniq | wc -l
Combined=$(cat $RegexRxLREER $HmmRxLR | sort | uniq | wc -l)
# echo ""
# echo "Extracting RxLRs from datasets"
OutDir=analysis/RxLR_effectors/combined_evidence/$Organism/$Strain
mkdir -p $OutDir
# cat $RegexRxLREER $RegexRxLRfpkm $HmmRxLR | sort | uniq > $OutDir/"$Strain"_total_ORF_RxLR_headers.txt
cat $RegexRxLREER $HmmRxLR | sort | uniq > $OutDir/"$Strain"_total_ORF_RxLR_headers.txt
ProgDir=/projects/oldhome/armita/git_repos/emr_repos/tools/seq_tools/feature_annotation
$ProgDir/gene_list_to_gff.pl $OutDir/"$Strain"_total_ORF_RxLR_headers.txt $Gff ORF_RxLR Name Augustus > $OutDir/"$Strain"_total_ORF_RxLR.gff
# echo "Number of genes in the extracted gff file:"
Extracted=$(cat $OutDir/"$Strain"_total_ORF_RxLR.gff | grep -w 'gene' | wc -l)
printf "$Organism\t$Strain\t$Regex\t$Hmm\t$Combined\t$Extracted\n"
done
P.cactorum 11-40 160 124 176 176
P.cactorum 12420 158 124 174 174
P.cactorum 15_13 159 122 174 174
P.cactorum 15_7 158 124 174 174
P.cactorum 17-21 162 124 178 178
P.cactorum 2003_3 157 122 172 172
P.cactorum 4032 163 128 179 179
P.cactorum 4040 158 124 174 174
P.cactorum 404 168 127 185 185
P.cactorum 415 184 141 203 203
P.cactorum 416 179 143 200 200
P.cactorum 62471 175 134 189 189
P.cactorum P295 174 134 189 189
P.cactorum P421_v2 163 126 179 179
P.cactorum PC13_15 162 127 178 178
P.cactorum R36_14 169 133 185 185
P.idaei 371 189 132 206 206
P.idaei SCRP370 172 128 187 187
P.idaei SCRP376 171 126 186 186
Intersection between the coodinates of putative RxLRs from gene models and ORFs were identified to determine the total number of RxLRs predicted in these genomes.
The RxLR effectors from both Gene models and ORF finding approaches were combined into a single file.
This step was complicated by the inconsistency in downloaded gff files for gene models.
for MergeDir in $(ls -d analysis/RxLR_effectors/combined_evidence/*/* | grep -e 'P.cactorum' -e 'P.idaei' | grep -v -e '414' -e '10300'); do
Strain=$(echo "$MergeDir" | rev | cut -f1 -d '/' | rev)
Species=$(echo "$MergeDir" | rev | cut -f2 -d '/' | rev)
AugGff=$MergeDir/"$Strain"_total_RxLR.gff
AugTxt=$MergeDir/"$Strain"_total_RxLR_headers.txt
AugFa=$(ls gene_pred/codingquary/"$Species"/"$Strain"/final/final_genes_combined.pep.fasta)
ORFGff=$(ls $MergeDir/"$Strain"_total_ORF_RxLR.gff)
ORFsFa=$(ls gene_pred/ORF_finder/"$Species"/"$Strain"/"$Strain".aa_cat.fa)
ORFsTxt=$(ls $MergeDir/"$Strain"_total_ORF_RxLR_headers.txt)
ORFsInAug=$MergeDir/"$Strain"_ORFsInAug_RxLR_EER_motif_hmm.gff
AugInORFs=$MergeDir/"$Strain"_AugInORFs_RxLR_EER_motif_hmm.gff
ORFsUniq=$MergeDir/"$Strain"_ORFsUniq_RxLR_EER_motif_hmm.gff
AugUniq=$MergeDir/"$Strain"_Aug_Uniq_RxLR_EER_motif_hmm.gff
TotalRxLRsTxt=$MergeDir/"$Strain"_Total_RxLR_EER_motif_hmm.txt
TotalRxLRsGff=$MergeDir/"$Strain"_Total_RxLR_EER_motif_hmm.gff
bedtools intersect -wa -u -a $ORFGff -b $AugGff > $ORFsInAug
bedtools intersect -wa -u -a $AugGff -b $ORFGff > $AugInORFs
bedtools intersect -v -wa -a $ORFGff -b $AugGff > $ORFsUniq
bedtools intersect -v -wa -a $AugGff -b $ORFGff > $AugUniq
# echo "$Species - $Strain"
# echo "The number of ORF RxLRs overlapping Augustus RxLRs:"
CountORFsInAug=$(cat $ORFsInAug | grep -w 'gene' | wc -l)
# echo "The number of Augustus RxLRs overlapping ORF RxLRs:"
CountAugInORFs=$(cat $AugInORFs | grep -w 'gene' | wc -l)
# echo "The number of RxLRs unique to ORF models:"
CountORFsUniq=$(cat $ORFsUniq | grep -w 'transcript' | cut -f9 | cut -f1 -d ';' | cut -f2 -d '=' | wc -l)
# cat $ORFsUniq | grep -w 'transcript' | cut -f9 | cut -f1 -d ';' | cut -f2 -d '=' >> $TotalRxLRsTxt
# echo "The number of RxLRs unique to Augustus models:"
CountAugUniq=$(cat $AugUniq | grep -w -e 'transcript' -e 'mRNA' | wc -l)
# echo "The total number of putative RxLRs are:"
cat $AugInORFs | grep -w -e 'transcript' -e 'mRNA' | cut -f9 | cut -f1 -d ';' | cut -f2 -d '=' > $TotalRxLRsTxt
cat $AugUniq | grep -w -e 'transcript' -e 'mRNA' | cut -f9 | cut -f1 -d ';' | cut -f2 -d '=' >> $TotalRxLRsTxt
cat $ORFsUniq | grep -w -e 'transcript' -e 'mRNA' | cut -f9 | cut -f3 -d ';' | cut -f2 -d '=' >> $TotalRxLRsTxt
CountTotalRxLR=$(cat $TotalRxLRsTxt | wc -l)
cat $AugInORFs $AugUniq $ORFsUniq | grep -w -f $TotalRxLRsTxt > $TotalRxLRsGff
RxLRsFa=$MergeDir/"$Strain"_final_RxLR_EER.fa
ProgDir=~/git_repos/emr_repos/tools/seq_tools/feature_annotation
# $ProgDir/unwrap_fasta.py --inp_fasta $AugFa | grep -A1 -w -f $AugTxt | grep -v -E '^--$' > $RxLRsFa
ProgDir=/projects/oldhome/armita/git_repos/emr_repos/tools/gene_prediction/ORF_finder
$ProgDir/extract_from_fasta.py --fasta $AugFa --headers $TotalRxLRsTxt > $RxLRsFa
# echo "$Strain"
$ProgDir/extract_from_fasta.py --fasta $ORFsFa --headers $TotalRxLRsTxt >> $RxLRsFa
# echo "$Strain"
# echo "The number of sequences extracted is"
CountExtractedRxLRs=$(cat $RxLRsFa | grep '>' | wc -l)
printf "$Species\t$Strain\t$CountORFsInAug\t$CountAugInORFs\t$CountORFsUniq\t$CountAugUniq\t$CountTotalRxLR\t$CountExtractedRxLRs\n"
done
P.cactorum 11-40 129 129 47 26 202 202
P.cactorum 12420 124 124 50 22 197 197
P.cactorum 15_13 120 120 54 24 199 199
P.cactorum 15_7 115 115 59 23 197 197
P.cactorum 17-21 136 136 42 29 207 207
P.cactorum 2003_3 128 128 44 25 198 198
P.cactorum 4032 122 122 57 22 203 203
P.cactorum 404 129 129 56 25 213 213
P.cactorum 4040 127 127 47 24 200 200
P.cactorum 415 126 126 77 26 229 229
P.cactorum 416 129 129 71 24 226 226
P.cactorum 62471 142 142 47 26 215 215
P.cactorum P295 132 132 57 23 213 213
P.cactorum P421_v2 126 126 53 25 205 205
P.cactorum PC13_15 130 130 48 25 205 205
P.cactorum R36_14 138 138 47 30 216 216
P.idaei 371 115 115 91 18 225 225
P.idaei SCRP370 119 119 68 26 214 214
P.idaei SCRP376 118 118 68 19 206 206
A hmm model relating to crinkler domains was used to identify putative crinklers in ORF gene models. This was done with the following commands:
for Proteome in $(ls gene_pred/ORF_finder/*/*/*.aa_cat.fa | grep -e 'P.cactorum' -e 'P.idaei' | grep -v -e '414' -e '10300'); do
# Setting variables
Strain=$(echo $Proteome | rev | cut -f2 -d '/' | rev)
Organism=$(echo $Proteome | rev | cut -f3 -d '/' | rev)
OutDir=analysis/CRN_effectors/hmmer_CRN/$Organism/$Strain
mkdir -p $OutDir
# Hmmer variables
ProgDir=/projects/oldhome/armita/git_repos/emr_repos/scripts/phytophthora/pathogen/hmmer
HmmDir=/projects/oldhome/groups/harrisonlab/project_files/idris/analysis/CRN_effectors/hmmer_models
# Searches for LFLAK domain
LFLAK_hmm=$HmmDir/Pinf_Pram_Psoj_Pcap_LFLAK.hmm
HmmResultsLFLAK="$Strain"_ORF_CRN_LFLAK_unmerged_hmmer.txt
hmmsearch -T 0 $LFLAK_hmm $Proteome > $OutDir/$HmmResultsLFLAK
echo "Searching for LFLAK domains in: $Organism $Strain"
cat $OutDir/$HmmResultsLFLAK | grep 'Initial search space'
cat $OutDir/$HmmResultsLFLAK | grep 'number of targets reported over threshold'
HmmFastaLFLAK="$Strain"_ORF_CRN_LFLAK_unmerged_hmmer.fa
$ProgDir/hmmer2fasta.pl $OutDir/$HmmResultsLFLAK $Proteome > $OutDir/$HmmFastaLFLAK
# Searches for DWL domain
DWL_hmm=$HmmDir/Pinf_Pram_Psoj_Pcap_DWL.hmm
HmmResultsDWL="$Strain"_ORF_CRN_DWL_unmerged_hmmer.txt
hmmsearch -T 0 $DWL_hmm $Proteome > $OutDir/$HmmResultsDWL
echo "Searching for DWL domains in: $Organism $Strain"
cat $OutDir/$HmmResultsDWL | grep 'Initial search space'
cat $OutDir/$HmmResultsDWL | grep 'number of targets reported over threshold'
HmmFastaDWL="$Strain"_ORF_CRN_DWL_unmerged_hmmer.fa
$ProgDir/hmmer2fasta.pl $OutDir/$HmmResultsDWL $Proteome > $OutDir/$HmmFastaDWL
# Identify ORFs found by both models
CommonHeaders=$OutDir/"$Strain"_ORF_CRN_DWL_LFLAK_unmerged_headers.txt
cat $OutDir/$HmmFastaLFLAK $OutDir/$HmmFastaDWL | grep '>' | cut -f1 | tr -d '>' | sort | uniq -d > $CommonHeaders
echo "The number of CRNs common to both models are:"
cat $CommonHeaders | wc -l
# The sequences will be merged based upon the strength of their DWL domain score
# For this reason headers as they appear in the DWL fasta file were extracted
Headers=$OutDir/"$Strain"_CRN_hmmer_unmerged_headers.txt
cat $OutDir/$HmmFastaDWL | grep '>' | grep -w -f $CommonHeaders | tr -d '>' | sed -r 's/\s+/\t/g'| sed 's/=\t/=/g' | tr -d '-' | sed 's/hmm_score/HMM_score/g' > $Headers
# As we are dealing with JGI and Broad sequences, some features need formatting:
ORF_Gff=$(ls gene_pred/ORF_finder/$Organism/$Strain/*_ORF.gff3 | grep -v '_atg_')
# Gff features were extracted for each header
CRN_unmerged_Gff=$OutDir/"$Strain"_CRN_unmerged_hmmer.gff3
# cat $Headers | cut -f1 > tmp.txt
ProgDir=/projects/oldhome/armita/git_repos/emr_repos/tools/gene_prediction/ORF_finder
$ProgDir/extract_gff_for_sigP_hits.pl $Headers $ORF_Gff CRN_HMM Name > $CRN_unmerged_Gff
# $ProgDir/extract_gff_for_sigP_hits.pl tmp.txt $ORF_Gff CRN_HMM Name > $CRN_unmerged_Gff
# Gff features were merged based upon the DWL hmm score
DbDir=analysis/databases/$Organism/$Strain
mkdir -p $DbDir
ProgDir=~/git_repos/emr_repos/scripts/phytophthora/pathogen/merge_gff
$ProgDir/make_gff_database.py --inp $CRN_unmerged_Gff --db $DbDir/CRN_ORF.db
CRN_Merged_Gff=$OutDir/"$Strain"_CRN_merged_hmmer.gff3
ProgDir=/projects/oldhome/armita/git_repos/emr_repos/tools/gene_prediction/ORF_finder
$ProgDir/merge_sigP_ORFs.py --inp $DbDir/CRN_ORF.db --id LFLAK_DWL_CRN --out $DbDir/CRN_ORF_merged.db --gff > $CRN_Merged_Gff
# Final results are reported:
echo "Number of CRN ORFs after merging:"
cat $CRN_Merged_Gff | grep 'gene' | wc -l
done
Searching for LFLAK domains in: P.cactorum 11-40
Initial search space (Z): 480478 [actual number of targets]
Domain search space (domZ): 237 [number of targets reported over threshold]
Searching for DWL domains in: P.cactorum 11-40
Initial search space (Z): 480478 [actual number of targets]
Domain search space (domZ): 288 [number of targets reported over threshold]
The number of CRNs common to both models are:
139
Number of CRN ORFs after merging:
94
Searching for LFLAK domains in: P.cactorum 12420
Initial search space (Z): 481792 [actual number of targets]
Domain search space (domZ): 252 [number of targets reported over threshold]
Searching for DWL domains in: P.cactorum 12420
Initial search space (Z): 481792 [actual number of targets]
Domain search space (domZ): 308 [number of targets reported over threshold]
The number of CRNs common to both models are:
148
Number of CRN ORFs after merging:
104
Searching for LFLAK domains in: P.cactorum 15_13
Initial search space (Z): 492365 [actual number of targets]
Domain search space (domZ): 256 [number of targets reported over threshold]
Searching for DWL domains in: P.cactorum 15_13
Initial search space (Z): 492365 [actual number of targets]
Domain search space (domZ): 322 [number of targets reported over threshold]
The number of CRNs common to both models are:
159
Number of CRN ORFs after merging:
106
Searching for LFLAK domains in: P.cactorum 15_7
Initial search space (Z): 493630 [actual number of targets]
Domain search space (domZ): 252 [number of targets reported over threshold]
Searching for DWL domains in: P.cactorum 15_7
Initial search space (Z): 493630 [actual number of targets]
Domain search space (domZ): 314 [number of targets reported over threshold]
The number of CRNs common to both models are:
150
Number of CRN ORFs after merging:
105
Searching for LFLAK domains in: P.cactorum 17-21
Initial search space (Z): 480195 [actual number of targets]
Domain search space (domZ): 226 [number of targets reported over threshold]
Searching for DWL domains in: P.cactorum 17-21
Initial search space (Z): 480195 [actual number of targets]
Domain search space (domZ): 283 [number of targets reported over threshold]
The number of CRNs common to both models are:
135
Number of CRN ORFs after merging:
91
Searching for LFLAK domains in: P.cactorum 2003_3
Initial search space (Z): 492609 [actual number of targets]
Domain search space (domZ): 252 [number of targets reported over threshold]
Searching for DWL domains in: P.cactorum 2003_3
Initial search space (Z): 492609 [actual number of targets]
Domain search space (domZ): 309 [number of targets reported over threshold]
The number of CRNs common to both models are:
150
Number of CRN ORFs after merging:
103
Searching for LFLAK domains in: P.cactorum 4032
Initial search space (Z): 497333 [actual number of targets]
Domain search space (domZ): 253 [number of targets reported over threshold]
Searching for DWL domains in: P.cactorum 4032
Initial search space (Z): 497333 [actual number of targets]
Domain search space (domZ): 330 [number of targets reported over threshold]
The number of CRNs common to both models are:
157
Number of CRN ORFs after merging:
110
Searching for LFLAK domains in: P.cactorum 4040
Initial search space (Z): 490721 [actual number of targets]
Domain search space (domZ): 257 [number of targets reported over threshold]
Searching for DWL domains in: P.cactorum 4040
Initial search space (Z): 490721 [actual number of targets]
Domain search space (domZ): 320 [number of targets reported over threshold]
The number of CRNs common to both models are:
156
Number of CRN ORFs after merging:
107
Searching for LFLAK domains in: P.cactorum 404
Initial search space (Z): 560045 [actual number of targets]
Domain search space (domZ): 248 [number of targets reported over threshold]
Searching for DWL domains in: P.cactorum 404
Initial search space (Z): 560045 [actual number of targets]
Domain search space (domZ): 303 [number of targets reported over threshold]
The number of CRNs common to both models are:
147
Number of CRN ORFs after merging:
103
Searching for LFLAK domains in: P.cactorum 415
Initial search space (Z): 491086 [actual number of targets]
Domain search space (domZ): 254 [number of targets reported over threshold]
Searching for DWL domains in: P.cactorum 415
Initial search space (Z): 491086 [actual number of targets]
Domain search space (domZ): 317 [number of targets reported over threshold]
The number of CRNs common to both models are:
154
Number of CRN ORFs after merging:
109
Searching for LFLAK domains in: P.cactorum 416
Initial search space (Z): 488742 [actual number of targets]
Domain search space (domZ): 260 [number of targets reported over threshold]
Searching for DWL domains in: P.cactorum 416
Initial search space (Z): 488742 [actual number of targets]
Domain search space (domZ): 319 [number of targets reported over threshold]
The number of CRNs common to both models are:
157
Number of CRN ORFs after merging:
107
Searching for LFLAK domains in: P.cactorum 62471
Initial search space (Z): 493951 [actual number of targets]
Domain search space (domZ): 249 [number of targets reported over threshold]
Searching for DWL domains in: P.cactorum 62471
Initial search space (Z): 493951 [actual number of targets]
Domain search space (domZ): 308 [number of targets reported over threshold]
The number of CRNs common to both models are:
153
Number of CRN ORFs after merging:
100
Searching for LFLAK domains in: P.cactorum P295
Initial search space (Z): 488637 [actual number of targets]
Domain search space (domZ): 254 [number of targets reported over threshold]
Searching for DWL domains in: P.cactorum P295
Initial search space (Z): 488637 [actual number of targets]
Domain search space (domZ): 327 [number of targets reported over threshold]
The number of CRNs common to both models are:
163
Number of CRN ORFs after merging:
109
Searching for LFLAK domains in: P.cactorum P421_v2
Initial search space (Z): 477118 [actual number of targets]
Domain search space (domZ): 238 [number of targets reported over threshold]
Searching for DWL domains in: P.cactorum P421_v2
Initial search space (Z): 477118 [actual number of targets]
Domain search space (domZ): 284 [number of targets reported over threshold]
The number of CRNs common to both models are:
142
Number of CRN ORFs after merging:
96
Searching for LFLAK domains in: P.cactorum PC13_15
Initial search space (Z): 496343 [actual number of targets]
Domain search space (domZ): 251 [number of targets reported over threshold]
Searching for DWL domains in: P.cactorum PC13_15
Initial search space (Z): 496343 [actual number of targets]
Domain search space (domZ): 321 [number of targets reported over threshold]
The number of CRNs common to both models are:
156
Number of CRN ORFs after merging:
109
Searching for LFLAK domains in: P.cactorum R36_14
Initial search space (Z): 499436 [actual number of targets]
Domain search space (domZ): 260 [number of targets reported over threshold]
Searching for DWL domains in: P.cactorum R36_14
Initial search space (Z): 499436 [actual number of targets]
Domain search space (domZ): 316 [number of targets reported over threshold]
The number of CRNs common to both models are:
159
Number of CRN ORFs after merging:
107
Searching for LFLAK domains in: P.idaei 371
Initial search space (Z): 487572 [actual number of targets]
Domain search space (domZ): 217 [number of targets reported over threshold]
Searching for DWL domains in: P.idaei 371
Initial search space (Z): 487572 [actual number of targets]
Domain search space (domZ): 254 [number of targets reported over threshold]
The number of CRNs common to both models are:
122
Number of CRN ORFs after merging:
86
Searching for LFLAK domains in: P.idaei SCRP370
Initial search space (Z): 487017 [actual number of targets]
Domain search space (domZ): 220 [number of targets reported over threshold]
Searching for DWL domains in: P.idaei SCRP370
Initial search space (Z): 487017 [actual number of targets]
Domain search space (domZ): 256 [number of targets reported over threshold]
The number of CRNs common to both models are:
125
Number of CRN ORFs after merging:
89
Searching for LFLAK domains in: P.idaei SCRP376
Initial search space (Z): 487331 [actual number of targets]
Domain search space (domZ): 217 [number of targets reported over threshold]
Searching for DWL domains in: P.idaei SCRP376
Initial search space (Z): 487331 [actual number of targets]
Domain search space (domZ): 260 [number of targets reported over threshold]
The number of CRNs common to both models are:
124
Number of CRN ORFs after merging:
89
Extract crinklers from ORFs and Braker/codingquary gene models
for MergeDir in $(ls -d analysis/CRN_effectors/hmmer_CRN/*/* | grep -e 'P.cactorum' -e 'P.idaei' | grep -v -e '414' -e '10300'); do
Strain=$(echo "$MergeDir" | rev | cut -f1 -d '/' | rev)
Species=$(echo "$MergeDir" | rev | cut -f2 -d '/' | rev)
AugGff=$(ls $MergeDir/"$Strain"_pub_CRN_LFLAK_DWL.gff)
AugFa=$(ls gene_pred/*/"$Species"/"$Strain"/final/final_genes_combined.pep.fasta)
ORFsFa=$(ls gene_pred/ORF_finder/"$Species"/"$Strain"/"$Strain".aa_cat.fa)
ORFGff=$MergeDir/"$Strain"_CRN_merged_hmmer.gff3
ORFsInAug=$MergeDir/"$Strain"_ORFsInAug_CRN_hmmer.bed
AugInORFs=$MergeDir/"$Strain"_AugInORFs_CRN_hmmer.bed
ORFsUniq=$MergeDir/"$Strain"_ORFsUniq_CRN_hmmer.bed
AugUniq=$MergeDir/"$Strain"_Aug_Uniq_CRN_hmmer.bed
TotalCRNsTxt=$MergeDir/"$Strain"_final_CRN.txt
TotalCRNsGff=$MergeDir/"$Strain"_final_CRN.gff
TotalCRNsHeaders=$MergeDir/"$Strain"_Total_CRN_headers.txt
bedtools intersect -wa -u -a $ORFGff -b $AugGff > $ORFsInAug
bedtools intersect -wa -u -a $AugGff -b $ORFGff > $AugInORFs
bedtools intersect -v -wa -a $ORFGff -b $AugGff > $ORFsUniq
bedtools intersect -v -wa -a $AugGff -b $ORFGff > $AugUniq
# echo "$Species - $Strain"
# echo "The number of ORF CRNs overlapping Augustus CRNs:"
CountORFsInAug=$(cat $ORFsInAug | grep -w -e 'transcript' -e 'mRNA' | wc -l)
# echo "The number of Augustus CRNs overlapping ORF CRNs:"
CountAugInORFs=$(cat $AugInORFs | grep -w -e 'transcript' -e 'mRNA' | wc -l)
cat $AugInORFs | grep -w -e 'transcript' -e 'mRNA' | cut -f9 | cut -f1 -d ';' | cut -f2 -d '=' > $TotalCRNsTxt
# echo "The number of CRNs unique to ORF models:"
CountORFsUniq=$(cat $ORFsUniq | grep -w 'transcript'| grep -w -e 'transcript' -e 'mRNA' | cut -f9 | cut -f4 -d ';' | cut -f2 -d '=' | wc -l)
cat $ORFsUniq | grep -w 'transcript'| grep -w -e 'transcript' -e 'mRNA' | cut -f9 | cut -f4 -d ';' | cut -f2 -d '=' >> $TotalCRNsTxt
# echo "The number of CRNs unique to Augustus models:"
CountAugUniq=$(cat $AugUniq | grep -w -e 'transcript' -e 'mRNA' | wc -l)
cat $AugUniq | grep -w -e 'transcript' -e 'mRNA' | cut -f9 | cut -f1 -d ';' | cut -f2 -d '=' >> $TotalCRNsTxt
cat $AugInORFs $AugUniq $ORFsUniq | grep -w -f $TotalCRNsTxt > $TotalCRNsGff
CRNsFa=$MergeDir/"$Strain"_final_CRN.fa
ProgDir=/projects/oldhome/armita/git_repos/emr_repos/tools/gene_prediction/ORF_finder
$ProgDir/extract_from_fasta.py --fasta $AugFa --headers $TotalCRNsTxt > $CRNsFa
$ProgDir/extract_from_fasta.py --fasta $ORFsFa --headers $TotalCRNsTxt >> $CRNsFa
# echo "The number of sequences extracted is"
CountNumExtracted=$(cat $CRNsFa | grep '>' | wc -l)
printf "$Species\t$Strain\t$CountORFsUniq\t$CountAugInORFs\t$CountORFsUniq\t$CountAugUniq\t$CountNumExtracted\n"
done
P.cactorum 11-40 40 54 40 5 99
P.cactorum 12420 42 62 42 4 108
P.cactorum 15_13 38 68 38 4 110
P.cactorum 15_7 40 67 40 4 111
P.cactorum 17-21 42 51 42 8 101
P.cactorum 2003_3 48 55 48 6 109
P.cactorum 4032 37 74 37 3 114
P.cactorum 404 57 49 57 4 110
P.cactorum 4040 41 66 41 7 114
P.cactorum 415 39 71 39 4 114
P.cactorum 416 41 70 41 8 119
P.cactorum 62471 48 52 48 9 109
P.cactorum P295 44 65 44 8 117
P.cactorum P421_v2 45 53 45 6 104
P.cactorum PC13_15 40 70 40 4 114
P.cactorum R36_14 37 70 37 11 118
P.idaei 371 46 41 46 1 88
P.idaei SCRP370 60 30 60 0 90
P.idaei SCRP376 63 26 63 0 89
A gff file containing the combined Braker and CodingQuary genes as well as the additional CRN and RxLR genes predicted by ORF analysis was made.
for GeneGff in $(ls gene_pred/*/P*/*/final/final_genes_appended.gff3 | grep -e 'P.cactorum' -e 'P.idaei' | grep -v -e '414' -e '10300'); do
Strain=$(echo $GeneGff | rev | cut -f3 -d '/' | rev)
Organism=$(echo $GeneGff | rev | cut -f4 -d '/' | rev)
echo "$Organism - $Strain"
GffOrfRxLR=$(ls analysis/RxLR_effectors/combined_evidence/$Organism/$Strain/*_ORFsUniq_RxLR_EER_motif_hmm.gff)
GffOrfCRN=$(ls analysis/CRN_effectors/hmmer_CRN/$Organism/$Strain/*_ORFsUniq_CRN_hmmer.bed)
Assembly=$(ls repeat_masked/$Organism/$Strain/filtered_contigs_repmask/*_contigs_softmasked_repeatmasker_TPSI_appended.fa)
OutDir=gene_pred/final_incl_ORF/$Organism/$Strain
mkdir -p $OutDir
cat $GeneGff > $OutDir/${Strain}_genes_incl_ORFeffectors.gff3
ProgDir=/projects/oldhome/armita/git_repos/emr_repos/tools/gene_prediction/ORF_finder
$ProgDir/add_ORF_features.pl $GffOrfRxLR $Assembly >> $OutDir/${Strain}_genes_incl_ORFeffectors.gff3
$ProgDir/add_ORF_features.pl $GffOrfCRN $Assembly >> $OutDir/${Strain}_genes_incl_ORFeffectors.gff3
# Make gene models from gff files.
ProgDir=/projects/oldhome/armita/git_repos/emr_repos/tools/gene_prediction/codingquary
# Assembly=$(ls repeat_masked/$Organism/$Strain/*/*_contigs_unmasked_wrapped.fa)
$ProgDir/gff2fasta.pl $Assembly $OutDir/${Strain}_genes_incl_ORFeffectors.gff3 $OutDir/${Strain}_genes_incl_ORFeffectors
done
High confidence genes overlapped by ORFs to be inserted were manually inspected to see whether the original gene model or the ORF should be kept. Manual inspection. indicated that predicted gene models were, in general, represented the true gene. As this approach was to be used for 20 genomes, not all conflicts were assessed manually. Bedtools was used to identify which genes were intersected:
for GeneGff in $(ls gene_pred/*/P*/*/final/final_genes_appended.gff3 | grep -e 'P.cactorum' -e 'P.idaei' | grep -v -e '414' -e '10300'); do
Strain=$(echo $GeneGff | rev | cut -f3 -d '/' | rev)
Organism=$(echo $GeneGff | rev | cut -f4 -d '/' | rev)
echo "$Organism - $Strain"
OutDir=gene_pred/final_incl_ORF/$Organism/$Strain
GffOrfRxLR=$(ls analysis/RxLR_effectors/combined_evidence/$Organism/$Strain/${Strain}_ORFsUniq_RxLR_EER_motif_hmm.gff)
GffOrfCRN=$(ls analysis/CRN_effectors/hmmer_CRN/$Organism/$Strain/${Strain}_ORFsUniq_CRN_hmmer.bed)
Assembly=$(ls repeat_masked/$Organism/$Strain/filtered_contigs_repmask/*_contigs_softmasked_repeatmasker_TPSI_appended.fa)
# Identify RxLR ORFs intersecting non-RxLR gene models
bedtools intersect -wo -a $GeneGff -b $GffOrfRxLR | grep -e "AUGUSTUS.gene" -e "CodingQuarry_v2.0.gene" -e 'PGNCodingQuarry_v2.0' | grep "ORF_RxLR.gene"| cut -f1,9,18 | sed 's/ID=//g' | tr -d ';' > $OutDir/RxLR_ORFs_intersecting_non-RxLR_genes.txt
# Identify CRN ORFs intersecting non-CRN gene models
bedtools intersect -wo -a $GeneGff -b $GffOrfCRN | grep -e "AUGUSTUS.gene" -e "CodingQuarry_v2.0.gene" -e 'PGNCodingQuarry_v2.0' | grep "CRN_HMM.gene"| cut -f1,9,18 | sed 's/ID=//g' | tr -d ';' > $OutDir/CRN_ORFs_intersecting_non-CRN_genes.txt
done
Some ORF effectors were noted to overlap AugustusCodingQuarry gene models. These were manually inspected in geneious and the following genes removed:
for CombinedGff in $(ls gene_pred/final_incl_ORF/*/*/*_genes_incl_ORFeffectors.gff3 | grep -e 'P.cactorum' -e 'P.idaei' | grep -v -e '414' -e '10300' | head -n1); do
Strain=$(echo $CombinedGff | rev | cut -f2 -d '/' | rev)
Organism=$(echo $CombinedGff | rev | cut -f3 -d '/' | rev)
# echo "$Organism - $Strain"
OutDir=$(dirname $CombinedGff)
cat $OutDir/RxLR_ORFs_intersecting_non-RxLR_genes.txt $OutDir/CRN_ORFs_intersecting_non-CRN_genes.txt | cut -f3 > $OutDir/exclude_list.txt
# echo "genes before filtering"
PreFilter=$(cat $CombinedGff | grep -w 'gene' | wc -l)
# echo "genes in list"
FilterList=$(cat $OutDir/exclude_list.txt | wc -l)
# echo "Unique genes in list"
UniqueFilterList=$(cat $OutDir/exclude_list.txt | sort | uniq | wc -l)
# echo "Number of genes removed"
GenesRemoved=$(cat $CombinedGff | grep -w -f $OutDir/exclude_list.txt | grep -w 'gene' | wc -l)
cat $CombinedGff | grep -w -v -f $OutDir/exclude_list.txt > $OutDir/${Strain}_genes_incl_ORFeffectors_filtered.gff3
# echo "final number of genes"
FinalGenes=$(cat $OutDir/${Strain}_genes_incl_ORFeffectors_filtered.gff3 | grep -w 'gene' | wc -l)
printf "$Organism\t$Strain\t$PreFilter\t$FilterList\t$UniqueFilterList\t$GenesRemoved\t$FinalGenes\n"
done
P.cactorum 11-40 28107 62 61 61 28046
P.cactorum 12420 25498 55 54 54 25444
P.cactorum 15_13 25996 54 54 54 25942
P.cactorum 15_7 25909 55 54 54 25855
P.cactorum 17-21 27926 58 57 57 27869
P.cactorum 2003_3 29014 66 64 64 28950
P.cactorum 4032 26113 52 51 51 26062
P.cactorum 4040 28727 58 57 57 28670
P.cactorum 404 35041 68 63 63 34978
P.cactorum 415 26102 61 60 60 26042
P.cactorum 416 28697 76 74 74 28623
P.cactorum 62471 28834 68 67 67 28767
P.cactorum P295 25684 57 57 57 25627
P.cactorum P421_v2 25629 63 63 63 25566
P.cactorum PC13_15 25821 50 50 50 25771
P.cactorum R36_14 29178 55 54 54 29124
P.idaei 371 24928 73 72 72 24856
P.idaei SCRP370 27411 57 57 57 27354
P.idaei SCRP376 24986 62 62 62 24924
for GffAppended in $(ls gene_pred/final_incl_ORF/*/*/*_genes_incl_ORFeffectors_filtered.gff3 | grep -e 'P.cactorum' -e 'P.idaei' | grep -v -e '414' -e '10300' | head -n1); do
Strain=$(echo $GffAppended | rev | cut -d '/' -f2 | rev)
Organism=$(echo $GffAppended | rev | cut -d '/' -f3 | rev)
echo "$Organism - $Strain"
FinalDir=$(dirname $GffAppended)
GffFiltered=$FinalDir/filtered_duplicates.gff
ProgDir=/projects/oldhome/armita/git_repos/emr_repos/tools/gene_prediction/codingquary
$ProgDir/remove_dup_features.py --inp_gff $GffAppended --out_gff $GffFiltered
GffRenamed=$FinalDir/final_genes_genes_incl_ORFeffectors_renamed.gff3
LogFile=$FinalDir/final_genes_appended_renamed.log
ProgDir=/projects/oldhome/armita/git_repos/emr_repos/tools/gene_prediction/codingquary
$ProgDir/gff_rename_genes.py --inp_gff $GffFiltered --conversion_log $LogFile > $GffRenamed
rm $GffFiltered
Assembly=$(ls repeat_masked/$Organism/$Strain/filtered_contigs_repmask/*_contigs_softmasked_repeatmasker_TPSI_appended.fa)
$ProgDir/gff2fasta.pl $Assembly $GffRenamed $FinalDir/final_genes_genes_incl_ORFeffectors_renamed
# The proteins fasta file contains * instead of Xs for stop codons, these should
# be changed
sed -i 's/\*/X/g' $FinalDir/final_genes_genes_incl_ORFeffectors_renamed.pep.fasta
done
for Transcriptome in $(ls gene_pred/final_incl_ORF/*/*/final_genes_genes_incl_ORFeffectors_renamed.pep.fasta | grep -e 'P.cactorum' -e 'P.idaei' | grep -v -e '414' -e '10300'); do
Strain=$(echo $Transcriptome| rev | cut -d '/' -f2 | rev)
Organism=$(echo $Transcriptome | rev | cut -d '/' -f3 | rev)
GffFile=$(echo $Transcriptome | sed 's/.pep.fasta/.gff3/g')
Genes=$(cat $GffFile | grep -w 'gene' | wc -l)
Proteins=$(cat $Transcriptome | grep '>' | wc -l)
printf "$Organism\t$Strain\t$Genes\t$Proteins\n"
done
for Transcriptome in $(ls gene_pred/final_incl_ORF/*/*/final_genes_genes_incl_ORFeffectors_renamed.gene.fasta | grep -e 'P.cactorum' -e 'P.idaei' | grep -v -e '414' -e '10300'); do
Strain=$(echo $Transcriptome| rev | cut -d '/' -f2 | rev)
Organism=$(echo $Transcriptome | rev | cut -d '/' -f3 | rev)
echo "$Organism - $Strain"
ProgDir=/projects/oldhome/armita/git_repos/emr_repos/tools/gene_prediction/busco
# BuscoDB="Fungal"
BuscoDB=$(ls -d /projects/oldhome/groups/harrisonlab/dbBusco/eukaryota_odb9)
OutDir=gene_pred/busco/$Organism/$Strain/genes
qsub $ProgDir/sub_busco3.sh $Transcriptome $BuscoDB $OutDir
done
INFO 281 Complete BUSCOs (C)
INFO 272 Complete and single-copy BUSCOs (S)
INFO 9 Complete and duplicated BUSCOs (D)
INFO 7 Fragmented BUSCOs (F)
INFO 15 Missing BUSCOs (M)
INFO 303 Total BUSCO groups searched
for File in $(ls gene_pred/busco/*/*/genes/*/short_summary_*.txt); do
Strain=$(echo $File| rev | cut -d '/' -f4 | rev)
Organism=$(echo $File | rev | cut -d '/' -f5 | rev)
Complete=$(cat $File | grep "(C)" | cut -f2)
Single=$(cat $File | grep "(S)" | cut -f2)
Fragmented=$(cat $File | grep "(F)" | cut -f2)
Missing=$(cat $File | grep "(M)" | cut -f2)
Total=$(cat $File | grep "Total" | cut -f2)
echo -e "$Organism\t$Strain\t$Complete\t$Single\t$Fragmented\t$Missing\t$Total"
done
Interproscan was used to give gene models functional annotations. Annotation was run using the commands below:
Note: This is a long-running script. As such, these commands were run using 'screen' to allow jobs to be submitted and monitored in the background. This allows the session to be disconnected and reconnected over time.
Screen ouput detailing the progress of submission of interporscan jobs was redirected to a temporary output file named interproscan_submission.log .
screen -a
# cd /projects/oldhome/groups/harrisonlab/project_files/idris
ProgDir=/projects/oldhome/armita/git_repos/emr_repos/tools/seq_tools/feature_annotation/interproscan
for Genes in $(ls gene_pred/final_incl_ORF/*/*/final_genes_genes_incl_ORFeffectors_renamed.pep.fasta | grep -e 'P.cactorum' -e 'P.idaei' | grep -v -e '414' -e '10300'); do
FileName=$(basename $Genes)
SymLinkDir=$(dirname $Genes)/symlink_directory
mkdir -p $SymLinkDir
cp -s $PWD/$Genes $SymLinkDir/$FileName
echo $Genes
$ProgDir/sub_interproscan.sh $SymLinkDir/$FileName
done 2>&1 | tee -a interproscan_submisison.log
Following interproscan annotation split files were combined using the following commands:
ProgDir=/projects/oldhome/armita/git_repos/emr_repos/tools/seq_tools/feature_annotation/interproscan
for Proteome in $(ls gene_pred/final_incl_ORF/*/*/symlink_directory/final_genes_genes_incl_ORFeffectors_renamed.pep.fasta | grep -e 'P.cactorum' -e 'P.idaei' | grep -v -e '414' -e '10300' | grep -v '11-40'); do
Strain=$(echo $Proteome | rev | cut -d '/' -f3 | rev)
Organism=$(echo $Proteome | rev | cut -d '/' -f4 | rev)
echo "$Organism - $Strain"
InterProRaw=gene_pred/interproscan/$Organism/$Strain/raw
$ProgDir/append_interpro.sh $Proteome $InterProRaw
done
for Proteome in $(ls gene_pred/final_incl_ORF/*/*/final_genes_genes_incl_ORFeffectors_renamed.pep.fasta | grep -e 'P.cactorum' -e 'P.idaei' | grep -v -e '414' -e '10300'); do
Strain=$(echo $Proteome | rev | cut -f2 -d '/' | rev)
Organism=$(echo $Proteome | rev | cut -f3 -d '/' | rev)
# Jobs=$(qstat | grep 'sub_swiss' | grep 'qw' | wc -l)
# while [ $Jobs -gt 1 ]; do
# sleep 1m
# printf "."
# Jobs=$(qstat | grep 'sub_swiss' | grep 'qw' | wc -l)
# done
OutDir=gene_pred/swissprot/$Organism/$Strain
SwissDbDir=../../uniprot/swissprot
SwissDbName=uniprot_sprot
ProgDir=/projects/oldhome/armita/git_repos/emr_repos/tools/seq_tools/feature_annotation/swissprot
qsub $ProgDir/sub_swissprot.sh $Proteome $OutDir $SwissDbDir $SwissDbName
done
A list of PFAM domains, superfamily annotations used as part of the DBD database and a further set of interproscan annotations listed by Shelest et al 2017 were made http://www.transcriptionfactor.org/index.cgi?Domain+domain:all https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5415576/
for Interpro in $(ls gene_pred/interproscan/*/*/*_interproscan.tsv | grep -e 'P.cactorum' -e 'P.idaei' | grep -v -e '414' -e '10300' | grep -w '404'); do
Organism=$(echo $Interpro | rev | cut -f3 -d '/' | rev)
Strain=$(echo $Interpro | rev | cut -f2 -d '/' | rev)
# echo "$Organism - $Strain"
OutDir=analysis/transcription_factors/$Organism/$Strain
mkdir -p $OutDir
ProgDir=/projects/oldhome/armita/git_repos/emr_repos/tools/seq_tools/feature_annotation/transcription_factors
$ProgDir/interpro2TFs.py --InterPro $Interpro > $OutDir/"$Strain"_TF_domains.tsv
# echo "total number of transcription factors"
cat $OutDir/"$Strain"_TF_domains.tsv | cut -f1 | sort | uniq > $OutDir/"$Strain"_TF_gene_headers.txt
NumTF=$(cat $OutDir/"$Strain"_TF_gene_headers.txt | wc -l)
printf "$Organism\t$Strain\t$NumTF\n"
done
P.cactorum 11-40 968
P.cactorum 12420 904
P.cactorum 15_13 913
P.cactorum 15_7 921
P.cactorum 17-21 952
P.cactorum 2003_3 1011
P.cactorum 4032 924
P.cactorum 4040 978
P.cactorum 404 1064
P.cactorum 415 946
P.cactorum 416 1007
P.cactorum 62471 1003
P.cactorum P295 928
P.cactorum P421_v2 861
P.cactorum PC13_15 932
P.cactorum R36_14 1038
P.idaei 371 884
P.idaei SCRP370 905
P.idaei SCRP376 862
Required programs:
- EffectorP.py
for Proteome in $(ls gene_pred/final_incl_ORF/*/*/final_genes_genes_incl_ORFeffectors_renamed.pep.fasta | grep -e 'P.cactorum' -e 'P.idaei' | grep -v -e '414' -e '10300'); do
Strain=$(echo $Proteome | rev | cut -f2 -d '/' | rev)
Organism=$(echo $Proteome | rev | cut -f3 -d '/' | rev)
echo "$Organism - $Strain"
BaseName="$Organism"_"$Strain"_EffectorP
OutDir=analysis/effectorP/$Organism/$Strain
ProgDir=~/git_repos/emr_repos/tools/seq_tools/feature_annotation/fungal_effectors
qsub $ProgDir/pred_effectorP.sh $Proteome $BaseName $OutDir
done
for File in $(ls analysis/effectorP/*/*/*_EffectorP.txt | grep -e 'P.cactorum' -e 'P.idaei' | grep -v -e '414' -e '10300'); do
Strain=$(echo $File | rev | cut -f2 -d '/' | rev)
Organism=$(echo $File | rev | cut -f3 -d '/' | rev)
# echo "$Organism - $Strain"
Headers=$(echo "$File" | sed 's/_EffectorP.txt/_EffectorP_headers.txt/g')
cat $File | grep 'Effector' | cut -f1 > $Headers
# printf "EffectorP headers:\t"
NumEffP=$(cat $Headers | wc -l)
printf "$Organism\t$Strain\t$NumEffP\n"
done
P.cactorum 11-40 19366
P.cactorum 12420 16432
P.cactorum 15_13 16682
P.cactorum 15_7 16710
P.cactorum 17-21 18894
P.cactorum 2003_3 20214
P.cactorum 4032 16916
P.cactorum 4040 20022
P.cactorum 404 26056
P.cactorum 415 17082
P.cactorum 416 19876
P.cactorum 62471 20048
P.cactorum P295 16578
P.cactorum P421_v2 16386
P.cactorum PC13_15 16352
P.cactorum R36_14 20220
P.idaei 371 15628
P.idaei SCRP370 19020
P.idaei SCRP376 16182
Carbohydrte active enzymes were idnetified using CAZYfollowing recomendations at http://csbl.bmb.uga.edu/dbCAN/download/readme.txt :
for Proteome in $(ls gene_pred/final_incl_ORF/*/*/final_genes_genes_incl_ORFeffectors_renamed.pep.fasta | grep -e 'P.cactorum' -e 'P.idaei' | grep -v -e '414' -e '10300'); do
Strain=$(echo $Proteome | rev | cut -f2 -d '/' | rev)
Organism=$(echo $Proteome | rev | cut -f3 -d '/' | rev)
echo "$Organism - $Strain"
OutDir=gene_pred/CAZY/$Organism/$Strain
mkdir -p $OutDir
Prefix="$Strain"_CAZY
CazyHmm=../../dbCAN/dbCAN-fam-HMMs.txt
ProgDir=/projects/oldhome/armita/git_repos/emr_repos/tools/seq_tools/feature_annotation/HMMER
qsub $ProgDir/sub_hmmscan.sh $CazyHmm $Proteome $Prefix $OutDir
done
The Hmm parser was used to filter hits by an E-value of E1x10-5 or E 1x10-e3 if they had a hit over a length of X %.
Those proteins with a signal peptide were extracted from the list and gff files representing these proteins made.
for File in $(ls gene_pred/CAZY/*/*/*CAZY.out.dm | grep -e 'P.cactorum' -e 'P.idaei' | grep -v -e '414' -e '10300'); do
Strain=$(echo $File | rev | cut -f2 -d '/' | rev)
Organism=$(echo $File | rev | cut -f3 -d '/' | rev)
OutDir=$(dirname $File)
# echo "$Organism - $Strain"
ProgDir=/projects/oldhome/groups/harrisonlab/dbCAN
$ProgDir/hmmscan-parser.sh $OutDir/"$Strain"_CAZY.out.dm > $OutDir/"$Strain"_CAZY.out.dm.ps
CazyHeaders=$(echo $File | sed 's/.out.dm/_headers.txt/g')
cat $OutDir/"$Strain"_CAZY.out.dm.ps | cut -f3 | sort | uniq > $CazyHeaders
# printf "number of CAZY genes identified:\t"
CazymeNum=$(cat $CazyHeaders | wc -l)
printf "$Organism\t$Strain\t$CazymeNum\n"
Gff=$(ls gene_pred/final_incl_ORF/$Organism/$Strain/final_genes_genes_incl_ORFeffectors_renamed.gff3)
CazyGff=$OutDir/"$Strain"_CAZY.gff
ProgDir=/projects/oldhome/armita/git_repos/emr_repos/tools/gene_prediction/ORF_finder
$ProgDir/extract_gff_for_sigP_hits.pl $CazyHeaders $Gff CAZyme ID > $CazyGff
# SecretedProts=$(ls gene_pred/combined_sigP/$Organism/$Strain/"$Strain"_secreted.fa)
# # SecretedProts=$(ls gene_pred/combined_sigP/$Organism/$Strain/"$Strain"_final_sp_no_trans_mem_no_GPI.aa)
# SecretedHeaders=$(echo $SecretedProts | sed 's/.fa/_headers.txt/g' | sed 's/.aa/_headers.txt/g')
# cat $SecretedProts | grep '>' | tr -d '>' > $SecretedHeaders
# CazyGffSecreted=$OutDir/"$Strain"_CAZY_secreted.gff
# $ProgDir/extract_gff_for_sigP_hits.pl $SecretedHeaders $CazyGff Secreted_CAZyme ID > $CazyGffSecreted
# printf "number of Secreted CAZY genes identified:\t"
# cat $CazyGffSecreted | grep -w 'mRNA' | cut -f9 | tr -d 'ID=' | cut -f1 -d ';' > $OutDir/"$Strain"_CAZY_secreted_headers.txt
# cat $OutDir/"$Strain"_CAZY_secreted_headers.txt | wc -l
done
P.cactorum 11-40 641
P.cactorum 12420 644
P.cactorum 15_13 640
P.cactorum 15_7 607
P.cactorum 17-21 646
P.cactorum 2003_3 623
P.cactorum 4032 667
P.cactorum 4040 646
P.cactorum 404 800
P.cactorum 415 624
P.cactorum 416 628
P.cactorum 62471 652
P.cactorum P295 622
P.cactorum P421_v2 647
P.cactorum PC13_15 662
P.cactorum R36_14 652
P.idaei 371 598
P.idaei SCRP370 565
P.idaei SCRP376 588
Note - the CAZY genes identified may need further filtering based on e value and cuttoff length - see below:
Cols in yourfile.out.dm.ps:
- Family HMM
- HMM length
- Query ID
- Query length
- E-value (how similar to the family HMM)
- HMM start
- HMM end
- Query start
- Query end
- Coverage
-
For fungi, use E-value < 1e-17 and coverage > 0.45
-
The best threshold varies for different CAZyme classes (please see http://www.ncbi.nlm.nih.gov/pmc/articles/PMC4132414/ for details). Basically to annotate GH proteins, one should use a very relax coverage cutoff or the sensitivity will be low (Supplementary Tables S4 and S9); (ii) to annotate CE families a very stringent E-value cutoff and coverage cutoff should be used; otherwise the precision will be very low due to a very high false positive rate (Supplementary Tables S5 and S10)
The following commands were used to identify b-glucan binding proteins as described in: https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3627985/
Identifying PHIbase homologs
qlogin -pe smp 6
cd /projects/oldhome/groups/harrisonlab/project_files/idris
for QueryFasta in $(ls gene_pred/final_incl_ORF/*/*/final_genes_genes_incl_ORFeffectors_renamed.cds.fasta | grep -e 'P.cactorum' -e 'P.idaei' | grep -v -e '414' -e '10300'); do
Strain=$(echo $QueryFasta | rev | cut -f2 -d '/' | rev)
Organism=$(echo $QueryFasta | rev | cut -f3 -d '/' | rev)
echo "$Organism - $Strain"
OutDir=analysis/blast_homology/$Organism/$Strain
mkdir -p $OutDir
dbFasta=$(ls /projects/oldhome/groups/harrisonlab/phibase/v4.4/phi_accessions.fa)
dbType="prot"
Prefix="${Strain}_phi_accessions"
Eval="1e-30"
makeblastdb -in $dbFasta -input_type fasta -dbtype $dbType -title $Prefix.db -parse_seqids -out $OutDir/$Prefix.db
blastx -num_threads 6 -db $OutDir/$Prefix.db -query $QueryFasta -outfmt 6 -num_alignments 1 -out $OutDir/${Prefix}_hits.txt -evalue $Eval
cat $OutDir/${Prefix}_hits.txt | grep 'effector' | cut -f1,2 | sort | uniq > $OutDir/${Prefix}_hits_headers.txt
done > log.txt
qlogin
cd /projects/oldhome/groups/harrisonlab/project_files/idris
for Assembly in $(ls repeat_masked/P.*/*/filtered_contigs_repmask/*_contigs_unmasked.fa | grep -e 'P.cactorum' -e 'P.idaei' | grep -v -e '414' -e '10300'); do
Strain=$(echo $Assembly | rev | cut -f3 -d '/' | rev)
Organism=$(echo $Assembly | rev | cut -f4 -d '/' | rev)
echo "$Organism - $Strain"
OutDir=analysis/blast_homology/$Organism/$Strain/Avr_homologs
mkdir -p $OutDir
AvrFasta=$(ls analysis/blast_homology/oomycete_avr_genes/appended_oomycete_effectors_cds.fasta)
dbType="nucl"
CdsFasta=$(ls gene_pred/final_incl_ORF/$Organism/$Strain/final_genes_genes_incl_ORFeffectors_renamed.cds.fasta)
Eval="1e-30"
#-------------------------------------------------------
# Step 1. Blast cds vs avr gene database
#-------------------------------------------------------
Prefix="${Strain}_vs_appended_oomycete_effectors"
makeblastdb -in $AvrFasta -input_type fasta -dbtype nucl -title $Prefix.db -parse_seqids -out $OutDir/$Prefix.db
tblastx -db $OutDir/$Prefix.db -query $CdsFasta -outfmt 6 -num_alignments 1 -out $OutDir/${Prefix}_hits.txt -evalue $Eval
cat $OutDir/${Prefix}_hits.txt | cut -f1,2 | sort | uniq > $OutDir/${Prefix}_hits_headers.txt
#-------------------------------------------------------
# Step 2. Blast avr gene database vs cds
#-------------------------------------------------------
Prefix="appended_oomycete_effectors_vs_${Strain}"
makeblastdb -in $CdsFasta -input_type fasta -dbtype nucl -title $Prefix.db -parse_seqids -out $OutDir/$Prefix.db
tblastx -db $OutDir/$Prefix.db -query $AvrFasta -outfmt 6 -num_alignments 1 -out $OutDir/${Prefix}_hits.txt -evalue $Eval
cat $OutDir/${Prefix}_hits.txt | cut -f1,2 | sort | uniq > $OutDir/${Prefix}_hits_headers.txt
#-------------------------------------------------------
# Step 3. reciprocal hits
#-------------------------------------------------------
cat $OutDir/${Strain}_vs_appended_oomycete_effectors_hits_headers.txt > $OutDir/all_hits.txt
cat $OutDir/appended_oomycete_effectors_vs_${Strain}_hits_headers.txt | awk ' { t = $1; $1 = $2; $2 = t; print; } ' OFS=$'\t' >> $OutDir/all_hits.txt
cat $OutDir/all_hits.txt | sort | uniq -d > $OutDir/reciprocal_best_hits.txt
printf "Number of reciprocal blast pairs: "
cat $OutDir/reciprocal_best_hits.txt | wc -l
rm $OutDir/all_hits.txt
done
Extract genes for reciprocal hits:
cd /projects/oldhome/groups/harrisonlab/project_files/idris
for Assembly in $(ls repeat_masked/P.*/*/filtered_contigs_repmask/*_contigs_unmasked.fa | grep -e 'P.cactorum' -e 'P.idaei' | grep -v -e '414' -e '10300'); do
Strain=$(echo $Assembly | rev | cut -f3 -d '/' | rev)
Organism=$(echo $Assembly | rev | cut -f4 -d '/' | rev)
echo "$Organism - $Strain"
OutDir=analysis/blast_homology/$Organism/$Strain/Avr_homologs
mkdir -p $OutDir
CdsFasta=$(ls gene_pred/final_incl_ORF/$Organism/$Strain/final_genes_genes_incl_ORFeffectors_renamed.cds.fasta)
reciprocalHeaders=$(ls $OutDir/reciprocal_best_hits.txt)
cat $reciprocalHeaders | cut -f1 > $OutDir/reciprocal_best_hits_headers.txt
ProgDir=/projects/oldhome/armita/git_repos/emr_repos/tools/gene_prediction/ORF_finder
$ProgDir/extract_from_fasta.py --fasta $CdsFasta --headers $OutDir/reciprocal_best_hits_headers.txt > $OutDir/${Strain}_reciprocal_best_hits.fa
rm $OutDir/reciprocal_best_hits.fa
done
The Pc isoaltes ex apple carried Avr3a homologs
The orthogroup 19835 was identified as representing these genes. This was
extracted from the CDS orthogroup files:
ls analysis/orthology/orthomcl/Pcac_Pinf_publication/orthogroups_fasta_nuc/orthogroup19835.fa
Orthogroup522 was noted to contain reciprocal blast homologs to Avr3b
ls analysis/orthology/orthomcl/Pcac_Pinf_publication/orthogroups_fasta_nuc/orthogroup522.fa
The Pc isolates ex strawberry carried a gene that was an avr1d homolog. This turned out to be a closer homolog to Avh32 from P. sojae.
The orthogroup 19835 was identified as representing these genes. This was
extracted from the CDS orthogroup files:
cat /data/scratch/armita/idris/gene_pred/annotation/P.cactorum/414/414_annotation_ncbi.tsv | grep -e 'Avr1d'
ls analysis/orthology/orthomcl/Pcac_Pinf_publication/orthogroups_fasta_nuc/orthogroup14979.fa
The Avh32 alignment to P. sojae showed 30 nucleotide differences between the sequences leading to 22 amino acid differences between the proteins.
OrthoIDs=analysis/orthology/orthomcl/Pcac_Pinf_publication/Pcac_Pinf_publication_isolateIDs.txt
printf \
"
Pc_CR1 414
Pc_CR2 12420
Pc_CR3 15_13
Pc_CR4 15_7
Pc_CR5 2003_3
Pc_CR6 4032
Pc_CR7 4040
Pc_CR8 404
Pc_CR9 415
Pc_CR10 416
Pc_CR11 P421_v2
Pc_CR12 PC13_15
Pc_CR13 10300
Pc_LR1 11-40
Pc_LR2 17-21
Pc_MD1 62471
Pc_MD2 P295
Pc_MD3 R36_14
Pi_RI1 371
Pi_RI2 SCRP370
Pi_RI3 SCRP376
" \
> $OrthoIDs
for GeneGff in $(ls gene_pred/final_incl_ORF/*/*/final_genes_genes_incl_ORFeffectors_renamed.gff3 | grep -e 'P.cactorum' -e 'P.idaei' | grep -v -e '414' -e '10300' | grep -v -e '415' -e '416'); do
Strain=$(echo $GeneGff | rev | cut -f2 -d '/' | rev)
Organism=$(echo $GeneGff | rev | cut -f3 -d '/' | rev)
OutDir=gene_pred/annotation/$Organism/$Strain
mkdir -p $OutDir
# combine sigP2 headers
cat gene_pred/final_sigP/$Organism/$Strain/${Strain}_aug_sp.aa | grep '>' | cut -f1 | tr -d '>' | tr -d ' ' > $OutDir/sigP2_headers_combined.txt
cat gene_pred/ORF_sigP/$Organism/$Strain/${Strain}_aug_sp.aa | grep '>' | cut -f1 | tr -d '>' | tr -d ' ' >> $OutDir/sigP2_headers_combined.txt
# cat gene_pred/ORF_sigP/$Organism/$Strain/${Strain}_ORF_sp.aa | grep '>' | cut -f1 | tr -d '>' | tr -d ' ' >> $OutDir/sigP2_headers_combined.txt
# cat gene_pred/ORF_sigP/$Organism/${Strain}_old/${Strain}_aug_sp.aa | grep '>' | cut -f1 | tr -d '>' | tr -d ' ' >> $OutDir/sigP2_headers_combined.txt
# combine sigP3 headers
cat gene_pred/final_signalp-3.0/$Organism/$Strain/${Strain}_aug_sp.aa | grep '>' | cut -f1 | tr -d '>' | tr -d ' ' > $OutDir/sigP3_headers_combined.txt
# cat gene_pred/ORF_signalp-3.0/$Organism/$Strain/${Strain}_aug_sp.aa | grep '>' | cut -f1 | tr -d '>' | tr -d ' ' >> $OutDir/sigP3_headers_combined.txt
# combine sigP4 headers
cat gene_pred/final_signalp-4.1/$Organism/$Strain/${Strain}_aug_sp.aa | grep '>' | cut -f1 | tr -d '>' | tr -d ' ' > $OutDir/sigP4_headers_combined.txt
cat gene_pred/ORF_signalp-4.1/$Organism/$Strain/${Strain}_aug_sp.aa | grep '>' | cut -f1 | tr -d '>' | tr -d ' ' >> $OutDir/sigP4_headers_combined.txt
cat analysis/phobius/$Organism/$Strain/${Strain}_phobius_headers.txt analysis/phobius/$Organism/$Strain/${Strain}_phobius_headers_ORF.txt | sed "s/^g_/contig_/g" > $OutDir/phobius_headers_combined.txt
cat analysis/RxLR_effectors/RxLR_EER_regex_finder/$Organism/$Strain/${Strain}_RxLR_EER_regex.fa analysis/RxLR_effectors/RxLR_EER_regex_finder/$Organism/$Strain/${Strain}_ORF_RxLR_EER_regex_merged.aa > $OutDir/${Strain}_combined_RxLR_EER_regex.fa
cat analysis/RxLR_effectors/hmmer_RxLR/$Organism/$Strain/${Strain}_RxLR_hmmer.fa analysis/RxLR_effectors/hmmer_RxLR/$Organism/$Strain/${Strain}_ORF_RxLR_hmmer.fa > $OutDir/${Strain}_combined_RxLR_hmmer.fa
cat analysis/RxLR_effectors/hmmer_WY/$Organism/$Strain/${Strain}_Aug_WY_hmmer.fa analysis/RxLR_effectors/hmmer_WY/$Organism/$Strain/${Strain}_ORF_WY_hmmer.fa > $OutDir/${Strain}_combined_WY_hmmer.fa
cat analysis/CRN_effectors/hmmer_CRN/$Organism/$Strain/${Strain}_ORFsUniq_CRN_hmmer.bed \
| grep -w 'transcript' | cut -f5 -d '=' | cut -f1 -d ';' \
> analysis/CRN_effectors/hmmer_CRN/$Organism/$Strain/${Strain}_ORFsUniq_CRN_hmmer.txt
cat analysis/CRN_effectors/hmmer_CRN/$Organism/$Strain/${Strain}_ORF_CRN_LFLAK_unmerged_hmmer.fa \
| grep -A1 -f analysis/CRN_effectors/hmmer_CRN/$Organism/$Strain/${Strain}_ORFsUniq_CRN_hmmer.txt \
| grep -v '\-\-' \
> analysis/CRN_effectors/hmmer_CRN/$Organism/$Strain/${Strain}_ORFsUniq_CRN_hmmer.fa
cat analysis/CRN_effectors/hmmer_CRN/$Organism/$Strain/${Strain}_pub_CRN_LFLAK_hmm.fa analysis/CRN_effectors/hmmer_CRN/$Organism/$Strain/${Strain}_ORFsUniq_CRN_hmmer.fa > $OutDir/${Strain}_combined_CRN_LFLAK_hmm.fa
cat analysis/CRN_effectors/hmmer_CRN/$Organism/$Strain/${Strain}_pub_CRN_DWL_hmm.fa analysis/CRN_effectors/hmmer_CRN/$Organism/$Strain/${Strain}_ORF_CRN_DWL_unmerged_hmmer.fa > $OutDir/${Strain}_combined_CRN_DWL_hmm.fa
Fasta=$(ls gene_pred/final_incl_ORF/$Organism/$Strain/final_genes_genes_incl_ORFeffectors_renamed.pep.fasta)
OrfGff=$(ls gene_pred/ORF_finder/$Organism/$Strain/${Strain}_ORF.gff3)
GeneConversion=$(ls gene_pred/final_incl_ORF/$Organism/$Strain/final_genes_appended_renamed.log)
TFs=$(ls analysis/transcription_factors/$Organism/$Strain/"$Strain"_TF_domains.tsv)
SigP2=$(ls $OutDir/sigP2_headers_combined.txt)
SigP3=$(ls $OutDir/sigP3_headers_combined.txt)
SigP4=$(ls $OutDir/sigP4_headers_combined.txt)
Phobius=$(ls $OutDir/phobius_headers_combined.txt)
TM_out=$(ls gene_pred/trans_mem/$Organism/$Strain/"$Strain"_TM_genes_pos.txt)
# GPI_out=$(ls gene_pred/trans_mem/$Organism/$Strain/GPIsom/GPI_pos.fa)
RxLR_Motif=$(ls analysis/RxLR_effectors/RxLR_EER_regex_finder/$Organism/$Strain/${Strain}_all_secreted_RxLR_regex.fa)
RxLR_Hmm=$(ls $OutDir/${Strain}_combined_RxLR_hmmer.fa)
RxLR_WY=$(ls $OutDir/${Strain}_combined_WY_hmmer.fa)
CRN_LFLAK=$(ls $OutDir/${Strain}_combined_CRN_LFLAK_hmm.fa)
CRN_DWL=$(ls $OutDir/${Strain}_combined_CRN_DWL_hmm.fa)
EffP_list=$(ls analysis/effectorP/$Organism/$Strain/"$Organism"_"$Strain"_EffectorP_headers.txt)
CAZY_list=$(ls gene_pred/CAZY/$Organism/$Strain/"$Strain"_CAZY.out.dm.ps)
PhiHits=$(ls analysis/blast_homology/$Organism/$Strain/"$Strain"_phi_accessions_hits_headers.txt)
InterPro=$(ls gene_pred/interproscan/$Organism/$Strain/*_interproscan.tsv)
SwissProt=$(ls gene_pred/swissprot/$Organism/$Strain/swissprot_vMar2018_tophit_parsed.tbl)
Orthology=$(ls /projects/oldhome/groups/harrisonlab/project_files/idris/analysis/orthology/orthomcl/Pcac_Pinf_publication/Pcac_Pinf_publication_orthogroups.txt)
OrthoIDs=$(ls analysis/orthology/orthomcl/Pcac_Pinf_publication/Pcac_Pinf_publication_isolateIDs.txt)
OrthoStrainID=$(cat $OrthoIDs | grep -w "$Strain" | cut -f1 -d ' ')
echo "$Strain - $OrthoStrainID"
OrthoStrainAll='Pc_CR1 Pc_CR2 Pc_CR3 Pc_CR4 Pc_CR5 Pc_CR6 Pc_CR7 Pc_CR8 Pc_CR9 Pc_CR10 Pc_CR11 Pc_CR12 Pc_CR13 Pc_LR1 Pc_LR2 Pc_MD1 Pc_MD2 Pc_MD3 Pi_RI1 Pi_RI2 Pi_RI3'
ProgDir=/projects/oldhome/armita/git_repos/emr_repos/scripts/phytophthora/gene_annotation
python $ProgDir/illumina_annotation_table.py \
--protein_fasta $Fasta \
--conversion_log $GeneConversion \
--ORF_gff $OrfGff \
--genes_gff $GeneGff \
--SigP2 $SigP2 \
--SigP3 $SigP3 \
--SigP4 $SigP4 \
--phobius $Phobius \
--TM_list $TM_out \
--RxLR_regex $RxLR_Motif \
--RxLR_hmm $RxLR_Hmm \
--CRN_dwl $CRN_DWL \
--CRN_lflak $CRN_LFLAK \
--EffP_list $EffP_list \
--PhiHits $PhiHits \
--CAZY $CAZY_list \
--TFs $TFs \
--InterPro $InterPro \
--Swissprot $SwissProt \
--orthogroups $Orthology \
--strain_id $OrthoStrainID \
--OrthoMCL_all $OrthoStrainAll \
> $OutDir/"$Strain"_annotation_stramenopiles.tsv
# > $OutDir/"$Strain"_annotation_ncbi2.tsv
done
Analysing annotation tables
cat $OutDir/"$Strain"_annotation_ncbi.tsv | cut -f25 | grep -v "^$" | grep -v 'ipr_effectors' | sort | uniq -c | sort -nr
for File in $(ls gene_pred/annotation/*/*/*_annotation_ncbi.tsv | grep -e 'P.cactorum' -e 'P.idaei' | grep -v -e '414' -e '10300'); do
Strain=$(echo $File | rev | cut -f2 -d '/' | rev)
Organism=$(echo $File | rev | cut -f3 -d '/' | rev)
GeneNum=$(cat $File | cut -f1 | tail -n+2 | cut -f1 -d '.' | uniq | wc -l)
ProtNum=$(cat $File | cut -f1 | tail -n+2 | uniq | wc -l)
Secreted=$(cat $File | cut -f11 | tail -n+2 | grep 'Yes' | wc -l)
EffP=$(cat $File | cut -f11,18 | tail -n+2 | grep "Yes.Yes" | wc -l)
Cazy=$(cat $File | cut -f11,19 | tail -n+2 | grep "Yes.CAZY"| wc -l)
RxLR=$(cat $File | cut -f14 | tail -n+2 | grep "RxLR" |wc -l)
CRN=$(cat $File | cut -f17 | tail -n+2 | grep "CRN" | wc -l)
TFs=$(cat $File | cut -f20 | tail -n+2 | grep -v "^$" | wc -l)
Elicitin=$(cat $File | cut -f11,25 | tail -n+2 | grep 'Yes' | grep "MAMP:Elicitin" | wc -l)
Transglutaminase=$(cat $File | cut -f11,25 | tail -n+2 | grep 'Yes' | grep "MAMP:Transglutaminase" | wc -l)
NLP=$(cat $File | cut -f11,25 | tail -n+2 | grep 'Yes' | grep "Apoplastic:NLP" | wc -l)
Kazal=$(cat $File | cut -f11,25 | tail -n+2 | grep 'Yes' | grep "Apoplastic:Protease inhibitor (Kazal-type)" | wc -l)
Cathepsin=$(cat $File | cut -f11,25 | tail -n+2 | grep 'Yes' | grep "Apoplastic:Protease inhibitor (cathepsin)" | wc -l)
Cystatin=$(cat $File | cut -f11,25 | tail -n+2 | grep 'Yes' | grep "Apoplastic:Protease inhibitor (cystatin-like)" | wc -l)
GlucanaseInhibitor=$(cat $File | cut -f11,25 | grep 'Yes' | tail -n+2 | grep "Apoplastic:Glucanase inhibitor" | wc -l)
Phytotoxin=$(cat $File | cut -f11,25 | tail -n+2 | grep 'Yes' | grep "Apoplastic:Phytotoxin" | wc -l)
Cutinase=$(cat $File | cut -f11,25 | tail -n+2 | grep 'Yes' | grep "Apoplastic:Cutinase" | wc -l)
printf "$Organism\t$Strain\t$GeneNum\t$ProtNum\t$Secreted\t$EffP\t$Cazy\t$RxLR\t$CRN"
printf "\t$TFs\t$Elicitin\t$Transglutaminase\t$NLP\t$Kazal\t$Cathepsin\t$Cystatin\t$GlucanaseInhibitor\t$Phytotoxin\t$Cutinase\n"
done
Apple expansion
AnnotTab=$(ls gene_pred/annotation/P.cactorum/62471/62471_annotation_ncbi2.tsv)
OutDir=gene_pred/annotation/P.cactorum/62471
Strain=62471
cat $AnnotTab | grep -w -e 'apple expanded' -e 'crown rot loss' | cut -f21 | sort | uniq | wc -l
cat $AnnotTab | grep -w -e 'apple expanded' -e 'crown rot loss' | wc -l
cat $AnnotTab | grep -w -e 'apple expanded'| cut -f21 | sort | uniq | wc -l
cat $AnnotTab | grep -w -e 'apple expanded' | wc -l
cat $AnnotTab | grep -w -e 'crown rot loss' | cut -f21 | sort | uniq | wc -l
cat $AnnotTab | grep -w -e 'crown rot loss' | wc -l
cat $AnnotTab | grep -w -e 'apple expanded' > $OutDir/"$Strain"_annotation_ncbi_md_expanded.tsv
cat $OutDir/"$Strain"_annotation_ncbi_md_expanded.tsv | grep 'RxLR' | wc -l
cat $OutDir/"$Strain"_annotation_ncbi_md_expanded.tsv | cut -f28 | wc -l
cat $AnnotTab | grep -w -e 'crown rot loss' > $OutDir/"$Strain"_annotation_ncbi_cr_loss.tsv
cat $OutDir/"$Strain"_annotation_ncbi_cr_loss.tsv | grep 'RxLR' | wc -l