-
Notifications
You must be signed in to change notification settings - Fork 2
/
mauve_alignments
101 lines (48 loc) · 4.3 KB
/
mauve_alignments
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
PIPELINE for Mauve whole genome alignments and subsequent CLONALFRAME analysis of regions of recombination
Take all strains belonging to each clade including cherry strains and others.
order genomes with contigs with mauve against pacbio/complete genome
align with progressive-mauve
extract core with stripSubsetLCBs - extract only the LCBs (homologous blocks) that are present in all strains
Convert to fasta and concatenate the alignment (all LCB alignments concantenated together)
Convert to phylip format
Build tree with RaxMLL (with GTR model, 100 bootstraps)
Use this tree and xmfa alignment output for ClonalFrameML
#CORE AND ACCESSORY GENOMES for each STRAIN
#Concatenate contigs in fasta file into 1 contig
#Extract coordinates of core genome LCBs for strain from core.xmfa and store in txt file
#Use coordinates to pull out core genome regions from fasta to either get core genome or remove to get accessory genome
..........
# Use move_contigs to order genomes based on reference for each phylogroup (Use ssh -Y cluster to ensure trusted X11 connection)
for GENOME in /home/hulinm/pseudomonas_data/pseudomonas/analysis/mauve/pss/genomes/*.fasta ; do
GENOME_SHORT=$(basename $GENOME | sed s/".fasta"//g )
echo $GENOME_SHORT
mkdir "$GENOME_SHORT"_dir
REFERENCE=/home/hulinm/pseudomonas_data/pseudomonas/analysis/mauve/pss/genomes/ref/Pss_B728a.fasta
echo $REFERENCE
java -Xmx500m -cp Mauve.jar org.gel.mauve.contigs.ContigOrderer -output /home/hulinm/pseudomonas_data/pseudomonas/analysis/mauve/pss/"$GENOME_SHORT"_dir -ref $REFERENCE -draft $GENOME
done
##copy all fasta to align directory
# Generate alignment of genome sequences using progressive mauve - remember to use linux-64
GENOMES=$(ls /home/hulinm/pseudomonas_data/pseudomonas/analysis/mauve/pss/align/removerogue/*.fasta)
/home/hulinm/local/src/mauve_snapshot_2015-02-13/linux-x64/progressiveMauve --output=/home/hulinm/pseudomonas_data/pseudomonas/analysis/mauve/pss/align/removerogue/pss.xmfa $GENOMES
# Strip alignment of non-core parts to create core genome alignment. Minimum length of LCB is user defined as 1000
/home/hulinm/local/src/mauve_2.3.1/stripSubsetLCBs /home/hulinm/pseudomonas_data/pseudomonas/analysis/mauve/race1/new/align/race1.xmfa /home/hulinm/pseudomonas_data/pseudomonas/analysis/mauve/race1/new/align/race1.xmfa.bbcols /home/hulinm/pseudomonas_data/pseudomonas/analysis/mauve/race1/new/align/race1_core.xmfa 1000
# Replace file path with nothing so identifier for each strain is short version
sed -e 's/\/home\/hulinm\/pseudomonas_data\/pseudomonas\/analysis\/mauve\/race1\/new\/align\///g' /home/hulinm/pseudomonas_data/pseudomonas/analysis/mauve/race1/new/align/race1_core.xmfa > /home/hulinm/pseudomonas_data/pseudomonas/analysis/mauve/race1/new/align/race1_core_align.xmfa
sed s/".fasta"//g /home/hulinm/pseudomonas_data/pseudomonas/analysis/mauve/race1/new/align/race1_core_align.xmfa > /home/hulinm/pseudomonas_data/pseudomonas/analysis/mauve/race1/new/align/race1_core.xmfa
# Convert xmfa to concatenated fasta (all LCBs concatenated)
perl /home/hulinm/git_repos/tools/analysis/python_effector_scripts/alignment_convert.pl -i race1_core.xmfa -o race1.fasta -f fasta -c
# Use Gblocks to strip alignment of ambiguous seq
# Convert fasta to phylip format
perl /home/hulinm/git_repos/tools/analysis/python_effector_scripts/alignment_convert.pl -i race1.fasta -o race1.phy -f phylip -g fasta
qsub /home/hulinm/git_repos/pseudomonas/orthomcl/sub_raxml.sh race1.phy
### CLONALFRAME
#Remove additional information in headers to make compatible with ClonalFrameML. Then remove blank lines
python /home/hulinm/git_repos/tools/analysis/python_effector_scripts/edit_xmfa.py race1_core.xmfa > race1_core
sed -i '/^$/d' race1_core
#### make sure names are OK - same in xmfa and tree ####
sed 's/Ps_myricae/Ps_myricaeAZ8448/g' RAxML_bestTree.out | sed 's/9657\/1-486/9657/g' | sed 's/9629\/1-486/9629/g' | sed 's/5300\/1-486/5300/g' | sed 's/Ps_rhaphio/Ps_rhaphiolepidis/g' | sed 's/9646\/1-486/9646/g' | sed 's/5269\/1-486/5269/g' | sed 's/5244\/1-486/5244/g' | sed 's/2341\/1-486/2341/g' > tree
# ClonalFrameML. Importation status gives you likely regions of recombinant origin
qsub /home/hulinm/git_repos/pseudomonas/orthomcl/sub_clonalframeML.sh tree2 race1_core R1
# Create graph
Rscript /home/hulinm/local/src/ClonalFrameML/src/cfml_results.R R1