A copy of these data can also be obtained from:
https://github.com/eead-csic-compbio/get_homologues/releases/tag/v1.1

## software

GET_HOMOLOGUES: version 09052018 (https://github.com/eead-csic-compbio/get_homologues)

## sequence files and lists

The original files used in this work are available in folder cds/
Please see the readme files therein for the source of each sequence file.

The original Brachypodium sequences were obtained from Phytozome.
Current up-to-date files can be downloaded from https://genome.jgi.doe.gov/portal/Phytozome/Phytozome.download.html

## 1) pan-genome sequence clustering #################################################

perl get_homologues-est.pl -d cds -S 1 -I cds/list.Bdis.Bsta.subgenomesDS \
  -m cluster -i 0 -M -A -t 0 &> log.Bdis.Bsta.subgenomesDS

# NJ tree splits cleanly both S & D genomes, likely annotation bias separates Bdistachyon3 and Bd21Control
plot_matrix_heatmap.sh -o pdf -d 1 -H 20 -W 30 -a 45 -b 12 -X 0.9 -N \
  -i cds_est_homologues/BdistachyonBd213v1_0taxa_list.Bdis.Bsta.subgenomesDS_algOMCL_e0_S1_Avg_identity.tab &> log.heat

# outfile: BdistachyonBd213v1_0taxa_list.Bdis.Bsta.subgenomesDS_algOMCL_e0_S1_Avg_identity_heatmap.pdf

# now with Bd21 gene names
perl get_homologues-est.pl -d cds -S 1 -I cds/list.Bdis.Bsta.subgenomesDS \
  -m cluster -i 0 -M -t 0 -r Bdistachyonv3.1 &> log.Bdis.Bsta.subgenomesDS.ref

perl get_homologues-est.pl -d cds -S 1 -I cds/list.Bdis.Bsta.subgenomesDS \
  -m cluster -i 0 -M -A -r Bdistachyonv3.1 &> log.Bdis.Bsta.subgenomesDS.ref.core

# NJ tree splits cleanly both S & D genomes; with core genes Bdistachyon3 and Bd21Control group together
plot_matrix_heatmap.sh -o pdf -d 1 -H 20 -W 30 -a 45 -b 12 -X 0.9 -N \
  -i cds_est_homologues/Bdistachyonv3_alltaxa_list.Bdis.Bsta.subgenomesDS_algOMCL_e0_S1_Avg_identity.tab &> log.heat.core

# outfile: Bdistachyonv3_alltaxa_list.Bdis.Bsta.subgenomesDS_algOMCL_e0_S1_Avg_identity_heatmap.pdf

# get pangenome matrix (PGM) with all clusters (nucl & prot)

# all clusters
perl compare_clusters.pl -d cds_est_homologues/Bdistachyonv3_0taxa_list.Bdis.Bsta.subgenomesDS_algOMCL_e0_S1_ \
  -o clusters.Bdis.Bsta.subgenomesDS -m -n &> log.clusters

perl compare_clusters.pl -d cds_est_homologues/Bdistachyonv3_0taxa_list.Bdis.Bsta.subgenomesDS_algOMCL_e0_S1_ \
  -o clusters.Bdis.Bsta.subgenomesDS -m &> log.clusters

perl -F'\t' -ane '$r++;for(1 .. @F){$m[$r][$_]=$F[$_-1]};$mx=@F;END{for(1 .. $mx){for $t(1 .. $r){print"$m[$t][$_]\t"}print"\n"}}' \
  clusters.Bdis.Bsta.subgenomesDS/pangenome_matrix_t0.tab > clusters.Bdis.Bsta.subgenomesDS/pangenome_matrix_t0.tr.tab

perl -F'\t' -ane '$r++;for(1 .. @F){$m[$r][$_]=$F[$_-1]};$mx=@F;END{for(1 .. $mx){for $t(1 .. $r){print"$m[$t][$_]\t"}print"\n"}}' \
  clusters.Bdis.Bsta.subgenomesDS/pangenome_matrix_genes_t0.tab > clusters.Bdis.Bsta.subgenomesDS/pangenome_matrix_genes_t0.tr.tab

# PGM-based ML phylogeny
cd clusters.Bdis.Bsta.subgenomesDS
get_phylomarkers/estimate_pangenome_phylogenies.sh -f pangenome_matrix_t0.fasta -r 10
cd ..

# outtree: best_PGM_IQT_abayes_UFBboot_run2_GTR2+FO+R3.treefile.pdf

# single-copy clusters with occup >= 10
perl compare_clusters.pl -d cds_est_homologues/Bdistachyonv3_0taxa_list.Bdis.Bsta.subgenomesDS_algOMCL_e0_S1_ \
  -o clusters.Bdis.Bsta.subgenomesDS.t10 -m -t 10 &> log.clusters.t10

# PGM-based ML phylogeny
cd clusters.Bdis.Bsta.subgenomesDS.t10
get_phylomarkers/estimate_pangenome_phylogenies.sh -f pangenome_matrix_t10.fasta -r 10
cd ..

# outtree: best_PGM_IQT_abayes_UFBboot_run2_GTR2+FO+R4.treefile.pdf

# estimate core & shell size

# Bdis
perl parse_pangenome_matrix.pl \
  -m clusters.Bdis.Bsta.subgenomesDS/pangenome_matrix_t0.tab -s -I cds/list.Bdis.pgm &> log.pgm.Bdis

# matrix contains 74658 clusters and 52 taxa
# cloud size: 21878 list: clusters.Bdis.Bsta.subgenomesDS/pangenome_matrix_t0__list.Bdis.pgm_cloud_list.txt
# shell size: 16891 list: clusters.Bdis.Bsta.subgenomesDS/pangenome_matrix_t0__list.Bdis.pgm_shell_list.txt
# soft core size: 21537 list: clusters.Bdis.Bsta.subgenomesDS/pangenome_matrix_t0__list.Bdis.pgm_softcore_list.txt
# core size: 16746 (included in soft core) list: clusters.Bdis.Bsta.subgenomesDS/pangenome_matrix_t0__list.Bdis.pgm_core_list.txt

# Bdis + subgenomeD
perl parse_pangenome_matrix.pl \
  -m clusters.Bdis.Bsta.subgenomesDS/pangenome_matrix_t0.tab -s -I cds/list.Bdis.subgenomesD.pgm &> log.pgm.Bdis.subgenomesD

# matrix contains 74658 clusters and 56 taxa
# cloud size: 24123 list: clusters.Bdis.Bsta.subgenomesDS/pangenome_matrix_t0__list.Bdis.subgenomesD.pgm_cloud_list.txt
# shell size: 19400 list: clusters.Bdis.Bsta.subgenomesDS/pangenome_matrix_t0__list.Bdis.subgenomesD.pgm_shell_list.txt
# soft core size: 21211 list: clusters.Bdis.Bsta.subgenomesDS/pangenome_matrix_t0__list.Bdis.subgenomesD.pgm_softcore_list.txt
# core size: 15924 (included in soft core) list: clusters.Bdis.Bsta.subgenomesDS/pangenome_matrix_t0__list.Bdis.subgenomesD.pgm_core_list.txt

# Bdis + Bsta
perl parse_pangenome_matrix.pl \
  -m clusters.Bdis.Bsta.subgenomesDS/pangenome_matrix_t0.tab -s -I cds/list.Bdis.Bsta.pgm &> log.pgm.Bdis.Bsta

# matrix contains 74658 clusters and 53 taxa
# cloud size: 26000 list: clusters.Bdis.Bsta.subgenomesDS/pangenome_matrix_t0__list.Bdis.Bsta.pgm_cloud_list.txt
# shell size: 17402 list: clusters.Bdis.Bsta.subgenomesDS/pangenome_matrix_t0__list.Bdis.Bsta.pgm_shell_list.txt
# soft core size: 21217 list: clusters.Bdis.Bsta.subgenomesDS/pangenome_matrix_t0__list.Bdis.Bsta.pgm_softcore_list.txt
# core size: 16122 (included in soft core) list: clusters.Bdis.Bsta.subgenomesDS/pangenome_matrix_t0__list.Bdis.Bsta.pgm_core_list.txt

# all
perl parse_pangenome_matrix.pl \
  -m clusters.Bdis.Bsta.subgenomesDS/pangenome_matrix_t0.tab -s &> log.pgm.all

# matrix contains 74658 clusters and 61 taxa
# cloud size: 29306 list: clusters.Bdis.Bsta.subgenomesDS/pangenome_matrix_t0__cloud_list.txt
# shell size: 25265 list: clusters.Bdis.Bsta.subgenomesDS/pangenome_matrix_t0__shell_list.txt
# soft core size: 20087 list: clusters.Bdis.Bsta.subgenomesDS/pangenome_matrix_t0__softcore_list.txt
# core size: 14134 (included in soft core) list: clusters.Bdis.Bsta.subgenomesDS/pangenome_matrix_t0__core_list.txt

## 2) dNdS analysis ####################################################################

# single-copy clusters of t>=6 including Bhyb118-5
perl get_homologues-est.pl -d cds/ -m cluster -i 0 -M -S 1 -e -t 6 \
  -I cds/list.Bdis.Bsta.Bhyb118.subgenomesDS -r Bdistachyonv3.1 &> log.Bdis.Bsta.Bhyb118.subgenomesDS.OMCL.ref

# number_of_clusters = 24459
# cluster_list = cds_est_homologues/Bdistachyonv3_6taxa_list.Bdis.Bsta.Bhyb118.subgenomesDS_algOMCL_e1_S1_.cluster_list
# cluster_directory = cds_est_homologues/Bdistachyonv3_6taxa_list.Bdis.Bsta.Bhyb118.subgenomesDS_algOMCL_e1_S1_

mkdir dNdS
mkdir dNdS/align dNdS/kaks
cp -r cds_est_homologues/Bdistachyonv3_6taxa_list.Bdis.Bsta.Bhyb118.subgenomesDS_algOMCL_e1_S1_ dNdS/pep_t6
cp -r cds_est_homologues/Bdistachyonv3_6taxa_list.Bdis.Bsta.Bhyb118.subgenomesDS_algOMCL_e1_S1_ dNdS/nucl_t6
rm -f dNdS/pep_t6/*fna
rm -f dNdS/nucl_t6/*faa

# produce cds codon alignments by copying protein-based alignments (selva)
# NOTE: this was done with script run_Fasta4phylogenomics.pl, see ../suppl_scripts, with multiple binary dependencies
# NOTE2: today this would be done using GET_PHYLOMARKERS (https://github.com/vinuesa/get_phylomarkers)
perl run_Fasta4phylogenomics.pl -d dNdS/nucl_t6/ -p dNdS/pep_t6/ -w dNdS/align/ -a -m pbs

# calculate kaks converting 99.00 omega values to 0.00
perl _calc_kaks.pl dNdS/align/ dNdS/kaks/ > dNdS.kaks.tab

cd dNdS
perl _annotate_alignments_dNdS.pl list.subgenomeD > subgenomeD.kaks.tab
perl _annotate_alignments_dNdS.pl list.subgenomeS > subgenomeS.kaks.tab
cd ..