Beginning Perl for Bioinformatics 总结提升
发布时间:2020-12-15 23:43:32 所属栏目:大数据 来源:网络整理
导读:1 Biology and Computer Science2 Getting Started with Perl3 The Art of Programming4 Sequences and Strings 5 Motifs and LoopsExample 5-3. Searching for motifs #!/usr/bin/perl -w # Searching for motifs # Ask the user for the filename of the f
1 Biology and Computer Science 2 Getting Started with Perl 3 The Art of Programming 4 Sequences and Strings 5 Motifs and Loops Example 5-3. Searching for motifs #!/usr/bin/perl -w # Searching for motifs # Ask the user for the filename of the file containing # the protein sequence data,and collect it from the keyboard print "Please type the filename of the protein sequence data: "; $proteinfilename = <STDIN>; # Remove the newline from the protein filename chomp $proteinfilename; # open the file,or exit unless ( open(PROTEINFILE,$proteinfilename) ) { print "Cannot open file "$proteinfilename"nn"; exit; } # Read the protein sequence data from the file,and store it # into the array variable @protein @protein = <PROTEINFILE>; # Close the file - we've read all the data into @protein now. close PROTEINFILE; # Put the protein sequence data into a single string,as it's easier # to search for a motif in a string than in an array of # lines (what if the motif occurs over a line break?) $protein = join( '',@protein); # Remove whitespace $protein =~ s/s//g; # In a loop,ask the user for a motif,search for the motif,# and report if it was found. # Exit if no motif is entered. do { print "Enter a motif to search for: "; $motif = <STDIN>; # Remove the newline at the end of $motif chomp $motif; # Look for the motif if ( $protein =~ /$motif/ ) { print "I found it!nn"; } else { print "I couldn't find it.nn"; } # exit on an empty user input } until ( $motif =~ /^s*$/ ); # exit the program exit; #!/usr/bin/perl -w # Determining frequency of nucleotides,take 2 # Get the DNA sequence data print "Please type the filename of the DNA sequence data: "; $dna_filename = <STDIN>; chomp $dna_filename; # Does the file exist? unless ( -e $dna_filename) { print "File "$dna_filename" doesn't seem to exist!!n"; exit; } # Can we open the file? unless ( open(DNAFILE,$dna_filename) ) { print "Cannot open file "$dna_filename"nn"; exit; } @DNA = <DNAFILE>; close DNAFILE; $DNA = join( '',@DNA); # Remove whitespace $DNA =~ s/s//g; # Initialize the counts. # Notice that we can use scalar variables to hold numbers. $count_of_A = 0; $count_of_C = 0; $count_of_G = 0; $count_of_T = 0; $errors = 0; # In a loop,look at each base in turn,determine which of the # four types of nucleotides it is,and increment the # appropriate count. for ( $position = 0 ; $position < length $DNA ; ++$position ) { $base = substr($DNA,$position,1); if ( $base eq 'A' ) { ++$count_of_A; } elsif ( $base eq 'C' ) { ++$count_of_C; } elsif ( $base eq 'G' ) { ++$count_of_G; } elsif ( $base eq 'T' ) { ++$count_of_T; } else { print "!!!!!!!! Error - I don't recognize this base: $basen"; ++$errors; } } # print the results print "A = $count_of_An"; print "C = $count_of_Cn"; print "G = $count_of_Gn"; print "T = $count_of_Tn"; print "errors = $errorsn"; # exit the program exit; Here's the output of Example 5-6: Please type the filename of the DNA sequence data: small.dna !!!!!!!! Error - I don't recognize this vase: V A = 40 C = 27 G = 24 T = 17 errors = 1 6 Subroutines and Bugs 7 Mutations and Randomization 8 The Genetic Code 9 Restriction Maps and Regular Expressions 10 GenBank Extract annotation and sequence from GenBank file #!/usr/bin/perl # Extract annotation and sequence from GenBank file use strict; use warnings; use BeginPerlBioinfo; # see Chapter 6 about this module # declare and initialize variables my @annotation = ( ); my $sequence = ''; my $filename = 'record.gb'; parse1(@annotation,$sequence,$filename); # Print the annotation,and then # print the DNA in new format just to check if we got it okay. print @annotation; print_sequence($sequence,50); exit; ####################### sub parse1 { my($annotation,$dna,$filename) = @_; # $annotation--reference to array # $dna --reference to scalar # $filename --scalar # declare and initialize variables my $in_sequence = 0; my @GenBankFile = ( ); # Get the GenBank data into an array from a file @GenBankFile = get_file_data($filename); # Extract all the sequence lines foreach my $line (@GenBankFile) { if( $line =~ /^//n/ ) { # If $line is end-of-record line //n,last; #break out of the foreach loop. } elsif( $in_sequence) { # If we know we're in a sequence,$$dna .= $line; # add the current line to $$dna. } elsif ( $line =~ /^ORIGIN/ ) { # If $line begins a sequence,$in_sequence = 1; # set the $in_sequence flag. } else{ # Otherwise push( @$annotation,$line); # add the current line to @annotation. } } # remove whitespace and line numbers from DNA sequence $$dna =~ s/[s0-9]//g; } Extract annotation and sequence from Genbank record #!/usr/bin/perl # Extract the annotation and sequence sections from the first # record of a GenBank library use strict; use warnings; use BeginPerlBioinfo; # Declare and initialize variables my $annotation = ''; my $dna = ''; my $record = ''; my $filename = 'record.gb'; my $save_input_separator = $/; # Open GenBank library file unless (open(GBFILE,$filename)) { print "Cannot open GenBank file "$filename"nn"; exit; } # Set input separator to "//n" and read in a record to a scalar $/ = "//n"; $record = <GBFILE>; # reset input separator $/ = $save_input_separator; # Now separate the annotation from the sequence data ($annotation,$dna) = ($record =~ /^(LOCUS.*ORIGINs*n)(.*)//n/s); # Print the two pieces,which should give us the same as the # original GenBank file,minus the // at the end print $annotation,$dna; exit; Parsing GenBank annotations using arrays #!/usr/bin/perl # Parsing GenBank annotations using arrays use strict; use warnings; use BeginPerlBioinfo; # see Chapter 6 about this module # Declare and initialize variables my @genbank = ( ); my $locus = ''; my $accession = ''; my $organism = ''; # Get GenBank file data @genbank = get_file_data('record.gb'); # Let's start with something simple. Let's get some of the identifying # information,let's say the locus and accession number (here the same # thing) and the definition and the organism. for my $line (@genbank) { if($line =~ /^LOCUS/) { $line =~ s/^LOCUSs*//; $locus = $line; }elsif($line =~ /^ACCESSION/) { $line =~ s/^ACCESSIONs*//; $accession = $line; }elsif($line =~ /^ ORGANISM/) { $line =~ s/^s*ORGANISMs*//; $organism = $line; } } print "*** LOCUS ***n"; print $locus; print "*** ACCESSION ***n"; print $accession; print "*** ORGANISM ***n"; print $organism; exit; Listing the contents of a folder (or directory) #!/usr/bin/perl # Demonstrating how to open a folder and list its contents use strict; use warnings; use BeginPerlBioinfo; # see Chapter 6 about this module my @files = ( ); my $folder = 'pdb'; # open the folder unless(opendir(FOLDER,$folder)) { print "Cannot open folder $folder!n"; exit; } # read the contents of the folder (i.e. the files and subfolders) @files = readdir(FOLDER); # close the folder closedir(FOLDER); # print them out,one per line print join( "n",@files),"n"; exit; Extract sequence chains from PDB file #!/usr/bin/perl # Extract sequence chains from PDB file use strict; use warnings; use BeginPerlBioinfo; # Read in PDB file: Warning - some files are very large! my @file = get_file_data('pdb/c1/pdb1c1f.ent'); # Parse the record types of the PDB file my %recordtypes = parsePDBrecordtypes(@file); # Extract the amino acid sequences of all chains in the protein my @chains = extractSEQRES( $recordtypes{'SEQRES'} ); # Translate the 3-character codes to 1-character codes,and print foreach my $chain (@chains) { print "****chain $chain **** n"; print "$chainn"; print iub3to1($chain),"n"; } exit; # parsePDBrecordtypes # #--given an array of a PDB file,return a hash with # keys = record type names # values = scalar containing lines for that record type sub parsePDBrecordtypes { my @file = @_; use strict; use warnings; my %recordtypes = ( ); foreach my $line (@file) { # Get the record type name which begins at the # start of the line and ends at the first space my($recordtype) = ($line =~ /^(S+)/); # .= fails if a key is undefined,so we have to # test for definition and use either .= or = depending if(defined $recordtypes{$recordtype} ) { $recordtypes{$recordtype} .= $line; }else{ $recordtypes{$recordtype} = $line; } } return %recordtypes; } # extractSEQRES # #--given an scalar containing SEQRES lines,# return an array containing the chains of the sequence sub extractSEQRES { use strict; use warnings; my($seqres) = @_; my $lastchain = ''; my $sequence = ''; my @results = ( ); # make array of lines my @record = split ( /n/,$seqres); foreach my $line (@record) { # Chain is in column 12,residues start in column 20 my ($thischain) = substr($line,11,1); my($residues) = substr($line,19,52); # add space at end # Check if a new chain,or continuation of previous chain if("$lastchain" eq "") { $sequence = $residues; }elsif("$thischain" eq "$lastchain") { $sequence .= $residues; # Finish gathering previous chain (unless first record) }elsif ( $sequence ) { push(@results,$sequence); $sequence = $residues; } $lastchain = $thischain; } # save last chain push(@results,$sequence); return @results; } # iub3to1 # #--change string of 3-character IUB amino acid codes (whitespace separated) # into a string of 1-character amino acid codes sub iub3to1 { my($input) = @_; my %three2one = ( 'ALA' => 'A','VAL' => 'V','LEU' => 'L','ILE' => 'I','PRO' => 'P','TRP' => 'W','PHE' => 'F','MET' => 'M','GLY' => 'G','SER' => 'S','THR' => 'T','TYR' => 'Y','CYS' => 'C','ASN' => 'N','GLN' => 'Q','LYS' => 'K','ARG' => 'R','HIS' => 'H','ASP' => 'D','GLU' => 'E',); # clean up the input $input =~ s/n/ /g; my $seq = ''; # This use of split separates on any contiguous whitespace my @code3 = split(' ',$input); foreach my $code (@code3) { # A little error checking if(not defined $three2one{$code}) { print "Code $code not definedn"; next; } $seq .= $three2one{$code}; } return $seq; } 12 BLAST Extract annotation and alignments from BLAST output file #!/usr/bin/perl # Extract annotation and alignments from BLAST output file use strict; use warnings; use BeginPerlBioinfo; # declare and initialize variables my $beginning_annotation = ''; my $ending_annotation = ''; my %alignments = ( ); my $filename = 'blast.txt'; parse_blast($beginning_annotation,$ending_annotation,%alignments,and then # print the DNA in new format just to check if we got it okay. print $beginning_annotation; foreach my $key (keys %alignments) { print "$keynXXXXXXXXXXXXn",$alignments{$key},"nXXXXXXXXXXXn"; } print $ending_annotation; exit; # parse_blast # # --parse beginning and ending annotation,and alignments,# from BLAST output file sub parse_blast { my($beginning_annotation,$ending_annotation,$alignments,$filename) = @_; # $beginning_annotation--reference to scalar # $ending_annotation --reference to scalar # $alignments --reference to hash # $filename --scalar # declare and initialize variables my $blast_output_file = ''; my $alignment_section = ''; # Get the BLAST program output into an array from a file $blast_output_file = join( '',get_file_data($filename)); # Extract the beginning annotation,alignments,and ending annotation ($$beginning_annotation,$alignment_section,$$ending_annotation) = ($blast_output_file =~ /(.*^ALIGNMENTSn)(.*)(^ Database:.*)/ms); # Populate %alignments hash # key = ID of hit # value = alignment section %$alignments = parse_blast_alignment($alignment_section); } # parse_blast_alignment # # --parse the alignments from a BLAST output file,# return hash with # key = ID # value = text of alignment sub parse_blast_alignment { my($alignment_section) = @_; # declare and initialize variables my(%alignment_hash) = ( ); # loop through the scalar containing the BLAST alignments,# extracting the ID and the alignment and storing in a hash # # The regular expression matches a line beginning with >,# and containing the ID between the first pair of | characters; # followed by any number of lines that don't begin with > while($alignment_section =~ /^>.*n(^(?!>).*n)+/gm) { my($value) = $&; my($key) = (split(/|/,$value)) [1]; $alignment_hash{$key} = $value; } return %alignment_hash; } 13 Further Topics (编辑:李大同) 【声明】本站内容均来自网络,其相关言论仅代表作者个人观点,不代表本站立场。若无意侵犯到您的权利,请及时与联系站长删除相关内容! |