原始数据:
基因组的fasta文件,两个基因组之间blast的比对坐标
scaffold1 2723 9022 contig00005 238996 1
解决方法一:
#!/usr/bin/perl -w use strict;use warnings;no strict "refs";my $usage = " perl mutation-vs-wild.m8 mutation.fa wild.fa \n";unless(@ARGV == 3){ dIE $usage; }my $aln = shift ;my $mut = shift;my $wild = shift;open ALN,"<",$aln or dIE "can't open $aln\n";#open MUT,$mut or dIE "can't open $mut\n";#open WILD,$wild or dIE "can't open $wild\n";my %mut = build_fa($mut);my %wild = build_fa($wild);coden(\%mut);coden(\%wild);while(<ALN>){ chomp; my ($scf,$st1,$ed1,$ctg,$st2,$ed2) = split"\t",$_; coden_aln($scf,$ed1); coden_aln($ctg,$ed2); }my %want_mut = get_coord(\%mut);my %want_wild = get_coord(\%wild);get_fa_seq ("mut",\%want_mut,\%mut);get_fa_seq ("wild",\%want_wild,\%wild);sub build_fa { my $file = shift @_; my %fa; my $tit; open FA,$file or dIE "can't open $file\n"; while(<FA>){ chomp; if(/>(.*)/){ $tit = ; }else{ $fa{$tit} .= $_; } } return %fa;}sub coden { my $hash = shift @_; foreach my $key (keys %$hash){ foreach my $i (1 .. length $hash->{$key}){ ${$key}[$i] = 0; } }}sub coden_aln { my ($scf,$st,$ed) = @_; if($st > $ed){ ($st,$ed) = ($ed,$st); } foreach my $i ($st .. $ed){ ${$scf}[$i] =1; }}sub get_coord { my %want; my $hash = shift @_; my $flag; foreach my $key (keys %$hash){ my $start; my $end; if($key =~/scaffold/){ $flag = "mut"; }else{ $flag = "wild"; } foreach my $i (1.. length $hash->{$key}){ if($i == 1 && ${$key}[1] == 0){ $start = 1; }elsif(${$key}[$i] == 0 && ${$key}[$i-1] ==1){ $start = $i; } if($i == length $hash->{$key} && ${$key}[$i]==0 ){ $end = $i; }elsif(${$key}[$i] == 0 && ${$key}[$i+1] == 1){ $end = $i; } if($start && $end && $start <= $end ){ $want{$key}{$start} = $end; } } } open MUT_CORD,">",$flag."_coord"; foreach my $key (keys %want){ foreach my $st (keys %{$want{$key}}){ my $end = $want{$key}{$st}; print MUT_CORD $key,"\t",$want{$key}{$st},"\n"; } } close MUT_CORD; return %want;}sub get_fa_seq { my ($flag,$want,$fa) = @_; open MUT_FA,$flag."_uniq.fa"; foreach my $key (keys %$fa){ foreach my $cord (keys %{$want->{$key}}){ my $start = $cord; my $end = $want->{$key}{$cord}; my $seq =substr ($fa->{$key},$start-1,$end-$start+1); print MUT_FA ">$key $start->$end\n$seq\n"; } } close MUT_FA;}总结
以上是内存溢出为你收集整理的Perl--寻找两个基因组中的unique序列全部内容,希望文章能够帮你解决Perl--寻找两个基因组中的unique序列所遇到的程序开发问题。
如果觉得内存溢出网站内容还不错,欢迎将内存溢出网站推荐给程序员好友。
欢迎分享,转载请注明来源:内存溢出
评论列表(0条)