lastal – 多基因组比对
-1.ref
https://github.com/gotouerina/Comparative-genomics-script/wiki/03.LASTAL-Alignment-Pipeline
https://gitlab.com/mcfrith/last
https://gitlab.com/mcfrith/last/-/blob/main/doc/last-papers.rst
0.开始之前
最好对用到的基因组的序列名进行重新命名,举几个例子:
Gallus_gallus.chr1
Aegithalos_caudatus.scaffold_8
Zosterops_lateralis.scaffold_19
这个步骤不再赘述
软件的安装不再赘述
1.对参考基因组建库, 参数可以照抄
lastdb -P8 -uMAM8 ref/chicken Gallus_gallus.fa
2.为基因组比对寻找合适的参数(对你的每个物种重复这个过程)
last-train -P8 --revsym -C2 ref/chicken Zosterops_lateralis.fa > train/Zosterops_lateralis.train
3.进行基因组比对(对你的每个物种重复这个过程)
lastal -P5 -p train/Zosterops_lateralis.train ref/chicken Zosterops_lateralis.fa | last-split -fMAF+ > maf/Zosterops_lateralis.maf
maf-swap maf/Zosterops_lateralis.maf | last-split | maf-swap | maf-sort > maf/Zosterops_lateralis.sort.maf
这里其实就比对结束,为了后续使用multiz,还需要对表头的注释行进行修改
grep -v '^#' maf/Zosterops_lateralis.sort.maf | awk 'BEGIN{print ##maf version=1}{print}' > maf/Gallus_gallus.Zosterops_lateralis.sing.maf
对表头的注释行进行修改
Gallus_gallus.Zosterops_lateralis.sing.maf就是后续multiz需要用到的文件
命名格式是:
ref基因组.query基因组.sing.maf
4.合并多基因组比对(自行安装Multiz)
./roast - T=/path/to/workdir E=Gallus_gallus "((Zosterops_lateralis sppA) Gallus_gallus)" Gallus_gallus.*.sing.maf ref_species_mulitway.maf > roast.sh
Gallus_gallus.*.sing.maf指定了所有的输入文件
#ref_species_mulitway.maf是最后的输出文件
#"((Zosterops_lateralis sppA) Gallus_gallus)"是你所有物种的系统发生关系,
#和传统的nwk文件类似,但是去除了所有枝长,并且把逗号变为空格
sh roast.sh
#运行完毕即可
#输出ref_species_mulitway.maf,就是最后的多基因组比对maf文件
5.Maf.to.gene
原始的脚本来自文章"10.1126/science.aav6202"
使用GPT进行了修改,主要添加了帮助文档和更详细的参数说明
cat 01.convertMaf2List.pl
perl 01.convertMaf2List.pl 查看帮助说明
#!/usr/bin/env perl
use strict;
use warnings;
use Getopt::Long;
# 定义变量
my ($maf_file, $output_file, $ref_species, $query_input, $tmp_file, $help);
# 解析参数
GetOptions(
"i=s" => /$maf_file,
"o=s" => /$output_file,
"ref=s" => /$ref_species,
"query=s" => /$query_input,
"tmp=s" => /$tmp_file,
"h|help" => /$help,
) or die "Error in command line arguments/n";
# 如果什么参数都没给,或者显式要求帮助
if (!$maf_file && !$output_file && !$ref_species && !$query_input && !$tmp_file || $help) {
print_help();
exit;
}
# 检查必需参数
die "Missing -i input file/n" unless $maf_file;
die "Missing -o output file/n" unless $output_file;
die "Missing -ref reference species/n" unless $ref_species;
die "Missing -query query species/n" unless $query_input;
# 设定默认的tmp输出
$tmp_file ||= "no_ref.maf";
# 处理query输入(逗号分割 或 文件)
my @species;
if (-e $query_input) {
open my $fh, "<", $query_input or die "Cannot open $query_input/n";
while (<$fh>) {
chomp;
next if /^$/;
push @species, $_;
}
close $fh;
} else {
@species = split(/,/, $query_input);
}
# 打开输出文件
open my $O, ">", $output_file or die "Cannot open $output_file/n";
open my $E, ">", $tmp_file or die "Cannot open $tmp_file/n";
# 打印表头
my @head = ("chr", "pos", @species);
print $O join("/t", @head), "/n";
# 读取maf文件
open my $I, "-|", "cat $maf_file" or die "Cannot open $maf_file/n";
my $content = "";
while (<$I>) {
next if(/^#/);
if (/^a/s+score=/) {
$content = $_;
while (<$I>) {
if (/^s/s+/) {
$content .= $_;
} else {
last;
}
}
analysis($content, $O, $E, $ref_species, /@species);
}
}
close $I;
close $O;
close $E;
# 子程序:解析一段alignment block
sub analysis {
my ($content, $O, $E, $ref, $species_ref) = @_;
my @line = split(//n/, $content);
shift @line; # 删除a行
my %pos;
my $isRefFound = 0;
my $ref_chr = "NA";
foreach my $line (@line) {
my @a = split(//s+/, $line);
my ($s, $chr, $start, $alignment_length, $strand, $sequence_length, $sequence) = @a;
$chr =~ /^([^/.]+)/.(.*)/;
my $species = $1;
$chr = $2;
if ($species eq $ref) {
$ref_chr = $chr;
$isRefFound = 1;
my @base = split(//, $sequence);
if ($strand eq "+") {
my $pos = $start;
for (my $i = 0; $i < @base; $i++) {
if ($base[$i] ne "-") {
$pos++;
$pos{$i} = $pos;
}
}
} elsif ($strand eq "-") {
my $pos = $start;
for (my $i = 0; $i < @base; $i++) {
if ($base[$i] ne "-") {
$pos++;
my $real_pos = $sequence_length - 1 - ($pos - 1) + 1;
$pos{$i} = $real_pos;
}
}
}
}
}
if ($isRefFound == 0) {
print $E $content;
return;
}
my %data;
foreach my $line (@line) {
my @a = split(//s+/, $line);
my ($s, $chr, $start, $alignment_length, $strand, $sequence_length, $sequence) = @a;
$chr =~ /^([^/.]+)/.(.*)/;
my $species = $1;
$chr = $2;
my @base = split(//, $sequence);
$sequence =~ tr/ATCGRYMK/TAGCYRKM/ if ($strand eq "-");
for (my $i = 0; $i < @base; $i++) {
next unless exists $pos{$i};
my $pos = $pos{$i};
$data{$pos}{$species} = $base[$i];
}
}
foreach my $pos (sort { $a <=> $b } keys %data) {
my @output_line = ($ref_chr, $pos);
foreach my $species (@$species_ref) {
my $base = "-";
$base = $data{$pos}{$species} if exists $data{$pos}{$species};
push @output_line, $base;
}
print $O join("/t", @output_line), "/n";
}
}
# 打印帮助信息
sub print_help {
print < -o -ref -query [-tmp ]
Parameters:
-i Input MAF file (required)
-o Output LST file (required)
-ref Reference species name (required)
-query Query species names (comma-separated) or a file with species list (required)
-tmp Output file for MAF blocks without reference species (optional, default: no_ref.maf)
-h, --help Show this help message
Examples:
perl 01.convertMaf2List.pl -i 5spp.maf -o 5spp.maf.lst -ref fonta -query baile,smi,rufe,roth
perl 01.convertMaf2List.pl -i 5spp.maf -o 5spp.maf.lst -ref fonta -query species.list
EOF
}
cat 02.lst2gene.pl
perl 02.lst2gene.pl 查看帮助文档
#!/usr/bin/env perl
use strict;
use warnings;
use Getopt::Long;
# 定义变量
my ($lst_file, $gff_file, $outdir, $help);
# 参数解析
GetOptions(
"lst=s" => /$lst_file,
"gff=s" => /$gff_file,
"outdir=s" => /$outdir,
"h|help" => /$help,
) or die "Error in command line arguments/n";
# 如果没给参数或者要求帮助
if (!$lst_file && !$gff_file || $help) {
print_help();
exit;
}
# 默认输出目录
$outdir ||= "genes";
# 创建输出目录
`mkdir -p $outdir` unless -e $outdir;
# 读取GFF
my %gene;
my %pos;
open my $GFF, "<", $gff_file or die "Cannot open $gff_file/n";
while (<$GFF>) {
chomp;
my @a = split(//s+/, $_);
next unless ($a[2] eq "CDS");
my ($chr, $start, $end, $strand) = ($a[0], $a[3], $a[4], $a[6]);
$a[8] =~ /Parent=(/w.*)/;
my $id = $1;
for (my $i = $start; $i <= $end; $i++) {
$pos{$chr}{$i} = $id;
$gene{$id}{pos}{$i} = 0;
}
$gene{$id}{strand} = $strand;
}
close $GFF;
print STDERR "$gff_file loaded.../n";
# 读取.lst文件
my %seq;
open my $LST, "<", $lst_file or die "Cannot open $lst_file/n";
my $head = <$LST>;
my @head = split(//s+/, $head);
my $control = 0;
while (<$LST>) {
my @a = split(//s+/, $_);
my ($chr, $pos) = ($a[0], $a[1]);
next unless exists $pos{$chr}{$pos};
print STDERR "[ $control ] sites loaded.../r" if ($control % 1000 == 0);
my $id = $pos{$chr}{$pos};
for (my $i = 2; $i < @a; $i++) {
my $species = $head[$i];
$a[$i] = uc($a[$i]);
$seq{$id}{$species}{$pos} = $a[$i];
}
$control++;
}
close $LST;
print STDERR "$lst_file loaded.../n";
# 输出每个基因
foreach my $id (sort keys %seq) {
print STDERR "$id/r";
open my $OUT, ">", "$outdir/$id.fa" or die "Cannot write to $outdir/$id.fa/n";
foreach my $species (sort keys %{$seq{$id}}) {
my $nucl;
foreach my $pos (sort { $a <=> $b } keys %{$gene{$id}{pos}}) {
my $base = "-";
$base = $seq{$id}{$species}{$pos} if exists $seq{$id}{$species}{$pos};
$nucl .= $base;
}
my $strand = $gene{$id}{strand};
if ($strand eq "-") {
$nucl =~ tr/ATCGRYMK/TAGCYRKM/;
$nucl = reverse($nucl);
}
print $OUT ">$species/n$nucl/n";
}
close $OUT;
}
print STDERR "/nDone/n";
# 帮助信息
sub print_help {
print < -gff [-outdir ]
Parameters:
-lst Input MAF-LST file (required)
-gff GFF annotation file (required)
-outdir Output directory for gene fasta files (optional, default: genes/)
-h, --help Show this help message
Examples:
perl 02.lst2gene.pl -lst 5spp.maf.lst -gff fonta.gff
perl 02.lst2gene.pl -lst 5spp.maf.lst -gff fonta.gff -outdir genes_output
EOF
}
共有 0 条评论