lastal – 多基因组比对

-1.ref
https://github.com/gotouerina/Comparative-genomics-script/wiki/03.LASTAL-Alignment-Pipeline
https://gitlab.com/mcfrith/last
https://gitlab.com/mcfrith/last/-/blob/main/doc/last-papers.rst

0.开始之前
最好对用到的基因组的序列名进行重新命名,举几个例子:
Gallus_gallus.chr1
Aegithalos_caudatus.scaffold_8
Zosterops_lateralis.scaffold_19
这个步骤不再赘述
软件的安装不再赘述

1.对参考基因组建库, 参数可以照抄
lastdb -P8 -uMAM8 ref/chicken Gallus_gallus.fa

2.为基因组比对寻找合适的参数(对你的每个物种重复这个过程)
last-train -P8 --revsym -C2 ref/chicken Zosterops_lateralis.fa > train/Zosterops_lateralis.train

3.进行基因组比对(对你的每个物种重复这个过程)
lastal -P5 -p train/Zosterops_lateralis.train ref/chicken Zosterops_lateralis.fa | last-split -fMAF+ > maf/Zosterops_lateralis.maf
maf-swap maf/Zosterops_lateralis.maf | last-split | maf-swap | maf-sort > maf/Zosterops_lateralis.sort.maf
这里其实就比对结束,为了后续使用multiz,还需要对表头的注释行进行修改

grep -v '^#' maf/Zosterops_lateralis.sort.maf | awk 'BEGIN{print ##maf version=1}{print}' > maf/Gallus_gallus.Zosterops_lateralis.sing.maf
对表头的注释行进行修改

Gallus_gallus.Zosterops_lateralis.sing.maf就是后续multiz需要用到的文件
命名格式是:
ref基因组.query基因组.sing.maf

4.合并多基因组比对(自行安装Multiz)

./roast - T=/path/to/workdir   E=Gallus_gallus  "((Zosterops_lateralis sppA) Gallus_gallus)" Gallus_gallus.*.sing.maf  ref_species_mulitway.maf > roast.sh
Gallus_gallus.*.sing.maf指定了所有的输入文件
#ref_species_mulitway.maf是最后的输出文件
#"((Zosterops_lateralis sppA) Gallus_gallus)"是你所有物种的系统发生关系,
#和传统的nwk文件类似,但是去除了所有枝长,并且把逗号变为空格

sh roast.sh
#运行完毕即可
#输出ref_species_mulitway.maf,就是最后的多基因组比对maf文件

5.Maf.to.gene
原始的脚本来自文章"10.1126/science.aav6202"
使用GPT进行了修改,主要添加了帮助文档和更详细的参数说明
cat 01.convertMaf2List.pl
perl 01.convertMaf2List.pl 查看帮助说明

#!/usr/bin/env perl
use strict;
use warnings;
use Getopt::Long;

# 定义变量
my ($maf_file, $output_file, $ref_species, $query_input, $tmp_file, $help);

# 解析参数
GetOptions(
    "i=s"     => /$maf_file,
    "o=s"     => /$output_file,
    "ref=s"   => /$ref_species,
    "query=s" => /$query_input,
    "tmp=s"   => /$tmp_file,
    "h|help"  => /$help,
) or die "Error in command line arguments/n";

# 如果什么参数都没给,或者显式要求帮助
if (!$maf_file && !$output_file && !$ref_species && !$query_input && !$tmp_file || $help) {
    print_help();
    exit;
}

# 检查必需参数
die "Missing -i input file/n" unless $maf_file;
die "Missing -o output file/n" unless $output_file;
die "Missing -ref reference species/n" unless $ref_species;
die "Missing -query query species/n" unless $query_input;

# 设定默认的tmp输出
$tmp_file ||= "no_ref.maf";

# 处理query输入(逗号分割 或 文件)
my @species;
if (-e $query_input) {
    open my $fh, "<", $query_input or die "Cannot open $query_input/n";
    while (<$fh>) {
        chomp;
        next if /^$/;
        push @species, $_;
    }
    close $fh;
} else {
    @species = split(/,/, $query_input);
}

# 打开输出文件
open my $O, ">", $output_file or die "Cannot open $output_file/n";
open my $E, ">", $tmp_file or die "Cannot open $tmp_file/n";

# 打印表头
my @head = ("chr", "pos", @species);
print $O join("/t", @head), "/n";

# 读取maf文件
open my $I, "-|", "cat $maf_file" or die "Cannot open $maf_file/n";
my $content = "";
while (<$I>) {
    next if(/^#/);
    if (/^a/s+score=/) {
        $content = $_;
        while (<$I>) {
            if (/^s/s+/) {
                $content .= $_;
            } else {
                last;
            }
        }
        analysis($content, $O, $E, $ref_species, /@species);
    }
}
close $I;
close $O;
close $E;

# 子程序:解析一段alignment block
sub analysis {
    my ($content, $O, $E, $ref, $species_ref) = @_;
    my @line = split(//n/, $content);
    shift @line; # 删除a行
    my %pos;
    my $isRefFound = 0;
    my $ref_chr = "NA";

    foreach my $line (@line) {
        my @a = split(//s+/, $line);
        my ($s, $chr, $start, $alignment_length, $strand, $sequence_length, $sequence) = @a;
        $chr =~ /^([^/.]+)/.(.*)/;
        my $species = $1;
        $chr = $2;
        if ($species eq $ref) {
            $ref_chr = $chr;
            $isRefFound = 1;
            my @base = split(//, $sequence);
            if ($strand eq "+") {
                my $pos = $start;
                for (my $i = 0; $i < @base; $i++) {
                    if ($base[$i] ne "-") {
                        $pos++;
                        $pos{$i} = $pos;
                    }
                }
            } elsif ($strand eq "-") {
                my $pos = $start;
                for (my $i = 0; $i < @base; $i++) {
                    if ($base[$i] ne "-") {
                        $pos++;
                        my $real_pos = $sequence_length - 1 - ($pos - 1) + 1;
                        $pos{$i} = $real_pos;
                    }
                }
            }
        }
    }

    if ($isRefFound == 0) {
        print $E $content;
        return;
    }

    my %data;
    foreach my $line (@line) {
        my @a = split(//s+/, $line);
        my ($s, $chr, $start, $alignment_length, $strand, $sequence_length, $sequence) = @a;
        $chr =~ /^([^/.]+)/.(.*)/;
        my $species = $1;
        $chr = $2;
        my @base = split(//, $sequence);
        $sequence =~ tr/ATCGRYMK/TAGCYRKM/ if ($strand eq "-");
        for (my $i = 0; $i < @base; $i++) {
            next unless exists $pos{$i};
            my $pos = $pos{$i};
            $data{$pos}{$species} = $base[$i];
        }
    }

    foreach my $pos (sort { $a <=> $b } keys %data) {
        my @output_line = ($ref_chr, $pos);
        foreach my $species (@$species_ref) {
            my $base = "-";
            $base = $data{$pos}{$species} if exists $data{$pos}{$species};
            push @output_line, $base;
        }
        print $O join("/t", @output_line), "/n";
    }
}

# 打印帮助信息
sub print_help {
    print < -o  -ref  -query  [-tmp ]

Parameters:
    -i       Input MAF file (required)
    -o       Output LST file (required)
    -ref     Reference species name (required)
    -query   Query species names (comma-separated) or a file with species list (required)
    -tmp     Output file for MAF blocks without reference species (optional, default: no_ref.maf)
    -h, --help  Show this help message

Examples:
    perl 01.convertMaf2List.pl -i 5spp.maf -o 5spp.maf.lst -ref fonta -query baile,smi,rufe,roth
    perl 01.convertMaf2List.pl -i 5spp.maf -o 5spp.maf.lst -ref fonta -query species.list

EOF
}

cat 02.lst2gene.pl
perl 02.lst2gene.pl 查看帮助文档

#!/usr/bin/env perl
use strict;
use warnings;
use Getopt::Long;

# 定义变量
my ($lst_file, $gff_file, $outdir, $help);

# 参数解析
GetOptions(
    "lst=s"   => /$lst_file,
    "gff=s"   => /$gff_file,
    "outdir=s" => /$outdir,
    "h|help"  => /$help,
) or die "Error in command line arguments/n";

# 如果没给参数或者要求帮助
if (!$lst_file && !$gff_file || $help) {
    print_help();
    exit;
}

# 默认输出目录
$outdir ||= "genes";

# 创建输出目录
`mkdir -p $outdir` unless -e $outdir;

# 读取GFF
my %gene;
my %pos;
open my $GFF, "<", $gff_file or die "Cannot open $gff_file/n";
while (<$GFF>) {
    chomp;
    my @a = split(//s+/, $_);
    next unless ($a[2] eq "CDS");
    my ($chr, $start, $end, $strand) = ($a[0], $a[3], $a[4], $a[6]);
    $a[8] =~ /Parent=(/w.*)/;
    my $id = $1;
    for (my $i = $start; $i <= $end; $i++) {
        $pos{$chr}{$i} = $id;
        $gene{$id}{pos}{$i} = 0;
    }
    $gene{$id}{strand} = $strand;
}
close $GFF;
print STDERR "$gff_file loaded.../n";

# 读取.lst文件
my %seq;
open my $LST, "<", $lst_file or die "Cannot open $lst_file/n";
my $head = <$LST>;
my @head = split(//s+/, $head);
my $control = 0;
while (<$LST>) {
    my @a = split(//s+/, $_);
    my ($chr, $pos) = ($a[0], $a[1]);
    next unless exists $pos{$chr}{$pos};
    print STDERR "[ $control ] sites loaded.../r" if ($control % 1000 == 0);
    my $id = $pos{$chr}{$pos};
    for (my $i = 2; $i < @a; $i++) {
        my $species = $head[$i];
        $a[$i] = uc($a[$i]);
        $seq{$id}{$species}{$pos} = $a[$i];
    }
    $control++;
}
close $LST;
print STDERR "$lst_file loaded.../n";

# 输出每个基因
foreach my $id (sort keys %seq) {
    print STDERR "$id/r";
    open my $OUT, ">", "$outdir/$id.fa" or die "Cannot write to $outdir/$id.fa/n";
    foreach my $species (sort keys %{$seq{$id}}) {
        my $nucl;
        foreach my $pos (sort { $a <=> $b } keys %{$gene{$id}{pos}}) {
            my $base = "-";
            $base = $seq{$id}{$species}{$pos} if exists $seq{$id}{$species}{$pos};
            $nucl .= $base;
        }
        my $strand = $gene{$id}{strand};
        if ($strand eq "-") {
            $nucl =~ tr/ATCGRYMK/TAGCYRKM/;
            $nucl = reverse($nucl);
        }
        print $OUT ">$species/n$nucl/n";
    }
    close $OUT;
}
print STDERR "/nDone/n";

# 帮助信息
sub print_help {
    print < -gff  [-outdir ]

Parameters:
    -lst       Input MAF-LST file (required)
    -gff       GFF annotation file (required)
    -outdir    Output directory for gene fasta files (optional, default: genes/)
    -h, --help Show this help message

Examples:
    perl 02.lst2gene.pl -lst 5spp.maf.lst -gff fonta.gff
    perl 02.lst2gene.pl -lst 5spp.maf.lst -gff fonta.gff -outdir genes_output

EOF
}

版权声明:
作者:cc
链接:https://www.techfm.club/p/209647.html
来源:TechFM
文章版权归作者所有,未经允许请勿转载。

THE END
分享
二维码
< <上一篇
下一篇>>