perl novel可变剪接识别(2)
发布时间:2020-12-16 00:15:10 所属栏目:大数据 来源:网络整理
导读:博主其实对未知的可变剪接分类有些困惑,但想了很久才使用了一项比较复杂的算法,接下来并不是分类,而是先转换数据库,因为ensembl|gencode数据库的格式并不能满足博主的需求,转换后更能方便地处理接下来的工作: #!/usr/bin/env perluse warnings;use str
博主其实对未知的可变剪接分类有些困惑,但想了很久才使用了一项比较复杂的算法,接下来并不是分类,而是先转换数据库,因为ensembl|gencode数据库的格式并不能满足博主的需求,转换后更能方便地处理接下来的工作: #!/usr/bin/env perl use warnings; use strict; my (%gene); open GTF,$ARGV[0] or die $!; while(<GTF>) { chomp; next if(/^#/); my @tmp = split; my ($gid) = $_ =~ /gene_id "([^;]+)";/; #匹配基因名称 if($_ =~ /gene_type "protein_coding";/) #分coding和noncoding分别处理,因为noncoding没有UTR结构,不存在UTR的可变剪接 { if($tmp[2] =~ /exon/) { push @{$gene{$tmp[0]}{$gid}{$tmp[6]}},"$tmp[3],$tmp[4],1"; } }else{ if($tmp[2] =~ /exon/) { push @{$gene{$tmp[0]}{$gid}{$tmp[6]}},0"; } } } close GTF; open OUT,">$ARGV[0].myformat" or die $!; foreach my $c(keys %gene) { foreach my $g(keys %{$gene{$c}}) { foreach my $ps(keys %{$gene{$c}{$g}}) { if($ps eq '+') { my ($s,$e,$type); if(@{$gene{$c}{$g}{$ps}} eq 1) { ($s,$type) = (split /,/,$gene{$c}{$g}{$ps}->[0])[0,1,2]; }else{ $s = (split /,$gene{$c}{$g}{$ps}->[0])[0]; $e = (split /,$gene{$c}{$g}{$ps}->[-1])[1]; $type = (split /,$gene{$c}{$g}{$ps}->[0])[2]; } my (@start,@end); foreach my $loci(@{$gene{$c}{$g}{$ps}}) { push @start,(split /,$loci)[0]; push @end,$loci)[1]; } my $ss = join ",",@start; my $ee = join ",@end; print OUT join "t",$c,$g,"$s-$e",$ps,$type,$ss,$ee,"n"; }else{ my @ps_a = reverse @{$gene{$c}{$g}{$ps}}; my ($s,$type); if(@ps_a eq 1) { ($s,$ps_a[0])[0,$ps_a[0])[0]; $e = (split /,$ps_a[-1])[1]; $type = (split /,$ps_a[0])[2]; } my (@start,@end); foreach my $loci(@ps_a) { push @start,"n"; } } } } 这个格式类似ucsc的refseq格式,当然只是类似而已,而博主所需求的是gene类型和exon起始与终止而已。 说到数据库的转换,博主还想起来一件事情,就是refseq转gff格式,其实两者差的有些多。 有个妹子写了一个转换的,还不错,展示的内容非常的详细想要啥结果都可以,当然在这之中博主也贡献了一点功劳,嘿嘿!就拿来在这展示一下吧: #!perl -w use strict; die "Usage : perl $0 <in.refGene.lst> <out.gff>" unless (@ARGV == 2); my ($in,$out) = @ARGV; my($pre,$insert,@inserts,$utr,$utr_o,$i,$j,$cds,$nm,$chr,$direction,$start_exon,$end_exon,$start_cds,$end_cds,$cds_num,$start,$end,$tmp,$gene,@starts,@ends,$up,$down); if ($in =~ /.gz/){open IN," gzip -dc $in | " || die $!;} else{open IN,$in || die $!;} if ($out =~ /.gz/){open OUT,"| gzip > $out" || die $!;} else {open OUT,"> $out" || die $!;} while (<IN>){ chomp; ($nm,$insert) = (split)[1..12,15]; if ($nm =~ /NM/){ $pre = 'mRNA'; }else{ $pre = 'ncRNA'; } $start =~ s/,$//; $end =~ s/,$//; $insert =~ s/,$//; $insert =~ s/-1/./g; if ($cds_num > 1){ @starts = split /,$start; @ends = split /,$end; @inserts = split /,$insert; }else{ @starts = ($start); @ends = ($end); @inserts = ($insert); } print OUT "$chrtrefGenet$pret$start_exont$end_exont.t$directiont.tID=$nm; name=$gene;n"; if ($direction eq '+'){ $utr = 5; $utr_o = 3; #print OUT "$chrtrefGenet5-UTRt$start_exont",$start_cds-1,"t.t$directiont.tParent=$nm;n"; } else { $utr = 3; $utr_o = 5; #print OUT "$chrtrefGenet3-UTRt$start_exont","t.t$directiont.tParent=$nm;n"; } for ($i = 0; $i < @starts; $i ++){ print OUT "$chrtrefGenetintront",$ends[$i-1]+1,"t",$starts[$i]-1,"t.t$directiont.tParent=$nm;n" if ($i > 0); if ($pre eq 'ncRNA'){ print OUT "$chrtrefGenetCDSt$starts[$i]t$ends[$i]t.t$directiont$inserts[$i]tParent=$nm;n"; next; } if ($ends[$i] < $start_cds){ print OUT "$chrtrefGenet$utr-UTRt$starts[$i]t$ends[$i]t.t$directiont$inserts[$i]tParent=$nm;n"; }elsif($starts[$i] < $start_cds and $ends[$i] > $start_cds){ print OUT "$chrtrefGenet$utr-UTRt$starts[$i]t",$start_cds - 1,"t.t$directiont.tParent=$nm;n"; print OUT "$chrtrefGenetCDSt$start_cdst$ends[$i]t.t$directiont$inserts[$i]tParent=$nm;n"; }elsif($starts[$i] >= $start_cds and $ends[$i] <= $end_cds){ print OUT "$chrtrefGenetCDSt$starts[$i]t$ends[$i]t.t$directiont$inserts[$i]tParent=$nm;n"; }elsif($starts[$i] < $end_cds and $ends[$i] > $end_cds){ print OUT "$chrtrefGenetCDSt$starts[$i]t$end_cdst.t$directiont$inserts[$i]tParent=$nm;n"; print OUT "$chrtrefGenet$utr_o-UTRt",$end_cds + 1,"t$ends[$i]t.t$directiont.tParent=$nm;n"; }else{ print OUT "$chrtrefGenet$utr_o-UTRt$starts[$i]t$ends[$i]t.t$directiont$inserts[$i]tParent=$nm;n"; } } } close IN; close OUT; 上面是正常的gff格式,下面将添加一些别的东西: #!perl -w use strict; die "Usage : perl $0 <in.file> <out.file> " if (@ARGV > 2); my ($in,$out) = @ARGV; $in ||= 'refGene.sort.2.gz'; $out ||= 'refGene.ann.gz'; if($in =~ /.gz/){ open IN,"gzip -dc $in |" || die $!; }else{ open IN,$in || die $!; } if ($out =~ /.gz/){ open OUT,"| gzip > $out " || die $!; }else{ open OUT,"> $out" || die $!; } my (@tmps,$ref,$region,$num,$dot,$id,$count_cds,$count_intron,$nm); my (%infos,%genes); my @chrs = (1..22,'X','Y'); $/ = "n>"; chomp (my $line = <IN>); @tmps = split /n/,$line; my ($chr_b,$end_b,$orientation,$id_b) = $tmps[0] =~ /(chrw+)tw+tw+RNAtd+t(d+)t.t([+-])t.t(ID=S+; name=S+)$/; $tmps[0] =~ s/>//; print OUT "$tmps[0]n"; @tmps[1..$#tmps] = &add_num ($orientation,@tmps[1..$#tmps]); print OUT join "n",@tmps[1..$#tmps]; print OUT "n"; while (<IN>){ chomp; @tmps = split /n/,$_; ($chr,$id) = $tmps[0] =~ /(chrw+)tw+tw+RNAt(d+)t(d+)t.t([+-])t.t(ID=S+; name=S+)$/; if ($chr eq $chr_b and $end_b < $start){ print OUT "$chrtrefGenetintergenict",$end_b + 1,$start - 1,"t.t.t.t$id_b|$idn"; } ($chr_b,$id_b) = ($chr,$id); $tmps[0] =~ s/>//; print OUT "$tmps[0]n"; @tmps[1..$#tmps] = &add_num ($orientation,@tmps[1..$#tmps]); print OUT join "n",@tmps[1..$#tmps]; print OUT "n"; } close IN; close OUT; #$/ = 'n'; sub add_num { my @tmps = @_; my ($count_cds,$count_intron) = (0,0); if ($tmps[0] eq '+'){ for my $tmp (@tmps[1..$#tmps]){ if ($tmp =~ /CDS/){ $count_cds ++; $tmp =~ s/.t+/$count_cdst+/; #$count_cds ++; }elsif($tmp =~ /intron/){ $count_intron ++; $tmp =~ s/.t+/$count_intront+/; #$count_intron ++; } } return @tmps[1..$#tmps]; } #return @tmps[1..$#tmps]; for my $tmp (@tmps[1..$#tmps]){ if ($tmp =~ /CDS/){ $count_cds ++; }elsif($tmp =~ /intron/){ $count_intron ++; } } for my $tmp (@tmps[1..$#tmps]){ if ($tmp =~ /CDS/){ $tmp =~ s/.t-/$count_cdst-/; $count_cds --; }elsif($tmp =~ /intron/){ $tmp =~ s/.t-/$count_intront-/; $count_intron --; } } return @tmps[1..$#tmps]; } 脚本如下: less refGene.txt.gz | sort -k3,3 -k5,5n |gzip > refGene.txt.sort.gz perl format.pl refGene.txt.sort.gz refGene.gff.gz less refGene.gff.gz |perl -lane 'if($F[2] =~ /RNA/){$F[0] = ">$F[0]";}print join "t",@F[0..7],"$F[8] $F[9]";' |gzip > refGene.gff.2.gz perl ann.sort.pl refGene.gff.2.gz refGene.ann.gz 最终的转换格式如下: chr1 refGene intron 763230 764381 1 + . Parent=NR_047525; chr1 refGene CDS 764382 764484 2 + . Parent=NR_047525; chr1 refGene intron 764485 787305 2 + . Parent=NR_047525; chr1 refGene CDS 787306 787490 3 + . Parent=NR_047525; chr1 refGene intron 787491 788049 3 + . Parent=NR_047525; chr1 refGene CDS 788050 788146 4 + . Parent=NR_047525; chr1 refGene intron 788147 788769 4 + . Parent=NR_047525; chr1 refGene CDS 788770 794826 5 + . Parent=NR_047525; chr1 refGene intergenic 794827 803449 . . . ID=NR_047525; name=LOC643837;|ID=NR_027055; name=FAM41C; chr1 refGene ncRNA 803450 812182 . - . ID=NR_027055; name=FAM41C; chr1 refGene CDS 803450 804055 3 - . Parent=NR_027055; chr1 refGene intron 804056 809490 2 - . Parent=NR_027055; chr1 refGene CDS 809491 810535 2 - . Parent=NR_027055; chr1 refGene intron 810536 812124 1 - . Parent=NR_027055; chr1 refGene CDS 812125 812182 1 - . Parent=NR_027055; chr1 refGene intergenic 812183 852951 . . . ID=NR_027055; name=FAM41C;|ID=NR_026874; name=LOC100130417; chr1 refGene ncRNA 852952 854817 . - . ID=NR_026874; name=LOC100130417; chr1 refGene CDS 852952 853100 4 - . Parent=NR_026874; chr1 refGene intron 853101 853400 3 - . Parent=NR_026874; chr1 refGene CDS 853401 853555 3 - . Parent=NR_026874; chr1 refGene intron 853556 854203 2 - . Parent=NR_026874; chr1 refGene CDS 854204 854295 2 - . Parent=NR_026874; chr1 refGene intron 854296 854713 1 - . Parent=NR_026874; chr1 refGene CDS 854714 854817 1 - . Parent=NR_026874; chr1 refGene intergenic 854818 861119 . . . ID=NR_026874; name=LOC100130417;|ID=NM_152486; name=SAMD11; chr1 refGene mRNA 861120 879961 . + . ID=NM_152486; name=SAMD11; chr1 refGene 5-UTR 861120 861180 . + . Parent=NM_152486; chr1 refGene intron 861181 861300 1 + . Parent=NM_152486; chr1 refGene 5-UTR 861301 861320 . + . Parent=NM_152486; chr1 refGene CDS 861321 861393 1 + 0 Parent=NM_152486; 第6列为exon和intron的排列顺序,第8列为翻译偏移量,intergenic为添加两个gene或transcript之间的距离等,加强版的gff格式~~~ (编辑:李大同) 【声明】本站内容均来自网络,其相关言论仅代表作者个人观点,不代表本站立场。若无意侵犯到您的权利,请及时与联系站长删除相关内容! |