Beginning Perl for Bioinformatics 总结提升

发布时间：2020-12-15 23:43:32 所属栏目：大数据来源：网络整理

导读：1 Biology and Computer Science2 Getting Started with Perl3 The Art of Programming4 Sequences and Strings 5 Motifs and LoopsExample 5-3. Searching for motifs #!/usr/bin/perl -w # Searching for motifs # Ask the user for the filename of the f

1 Biology and Computer Science
2 Getting Started with Perl
3 The Art of Programming
4 Sequences and Strings

5 Motifs and Loops
Example 5-3. Searching for motifs 
#!/usr/bin/perl -w 
# Searching for motifs 
# Ask the user for the filename of the file containing 
# the protein sequence data,and collect it from the keyboard 
print "Please type the filename of the protein sequence data: ";
$proteinfilename = <STDIN>; 
# Remove the newline from the protein filename 
chomp $proteinfilename; 
# open the file,or exit 
unless ( open(PROTEINFILE,$proteinfilename) ) { 
print "Cannot open file "$proteinfilename"nn"; 
exit; 
} 
# Read the protein sequence data from the file,and store it 
# into the array variable @protein 
@protein = <PROTEINFILE>; 
# Close the file - we've read all the data into @protein now. 
close PROTEINFILE; 
# Put the protein sequence data into a single string,as it's easier 
# to search for a motif in a string than in an array of 
# lines (what if the motif occurs over a line break?) 
$protein = join( '',@protein); 
# Remove whitespace 
$protein =~ s/s//g; 
# In a loop,ask the user for a motif,search for the motif,# and report if it was found. 
# Exit if no motif is entered. 
do { 
print "Enter a motif to search for: "; 
$motif = <STDIN>; 
# Remove the newline at the end of $motif 
chomp $motif; 
# Look for the motif 
if ( $protein =~ /$motif/ ) { 
print "I found it!nn"; 
} else { 
print "I couldn't find it.nn"; 
}
# exit on an empty user input 
} until ( $motif =~ /^s*$/ ); 
# exit the program 
exit; 

#!/usr/bin/perl -w 
# Determining frequency of nucleotides,take 2 
# Get the DNA sequence data 
print "Please type the filename of the DNA sequence data: "; 
$dna_filename = <STDIN>; 
chomp $dna_filename; 
# Does the file exist? 
unless ( -e $dna_filename) { 
print "File "$dna_filename" doesn't seem to exist!!n"; 
exit; 
} 
# Can we open the file? 
unless ( open(DNAFILE,$dna_filename) ) { 
print "Cannot open file "$dna_filename"nn"; 
exit; 
} 
@DNA = <DNAFILE>; 
close DNAFILE; 
$DNA = join( '',@DNA); 
# Remove whitespace 
$DNA =~ s/s//g; 
# Initialize the counts. 
# Notice that we can use scalar variables to hold numbers. 
$count_of_A = 0; 
$count_of_C = 0; 
$count_of_G = 0; 
$count_of_T = 0; 
$errors     = 0; 
# In a loop,look at each base in turn,determine which of the 
# four types of nucleotides it is,and increment the 
# appropriate count. 
for ( $position = 0 ; $position < length $DNA ; ++$position ) { 
$base = substr($DNA,$position,1);
if     ( $base eq 'A' ) { 
++$count_of_A; 
} elsif ( $base eq 'C' ) { 
++$count_of_C; 
} elsif ( $base eq 'G' ) { 
++$count_of_G; 
} elsif ( $base eq 'T' ) { 
++$count_of_T; 
} else { 
print "!!!!!!!! Error - I don't recognize this base: 
$basen"; 
++$errors; 
} 
} 
# print the results 
print "A = $count_of_An"; 
print "C = $count_of_Cn"; 
print "G = $count_of_Gn"; 
print "T = $count_of_Tn"; 
print "errors = $errorsn"; 
# exit the program 
exit; 
Here's the output of Example 5-6: 
Please type the filename of the DNA sequence data: small.dna 
!!!!!!!! Error - I don't recognize this vase: V 
A = 40 
C = 27 
G = 24 
T = 17 
errors = 1 



6 Subroutines and Bugs
7 Mutations and Randomization
8 The Genetic Code
9 Restriction Maps and Regular Expressions

10 GenBank
Extract annotation and sequence from GenBank file 
#!/usr/bin/perl 
# Extract annotation and sequence from GenBank file 
use strict; 
use warnings; 
use BeginPerlBioinfo;     # see Chapter 6 about this module 
# declare and initialize variables 
my @annotation = (  ); 
my $sequence = ''; 
my $filename = 'record.gb'; 
parse1(@annotation,$sequence,$filename); 
# Print the annotation,and then 
#   print the DNA in new format just to check if we got it okay. 
print @annotation; 
print_sequence($sequence,50); 
exit; 
#######################
sub parse1 { 
my($annotation,$dna,$filename) = @_; 
# $annotation--reference to array 
# $dna 
--reference to scalar 
# $filename  --scalar 
# declare and initialize variables 
my $in_sequence = 0; 
my @GenBankFile = (  ); 
# Get the GenBank data into an array from a file 
@GenBankFile = get_file_data($filename); 
# Extract all the sequence lines 
foreach my $line (@GenBankFile) { 
if( $line =~ /^//n/ ) { # If $line is end-of-record line 
//n,last; #break out of the foreach loop. 
} elsif( $in_sequence) { # If we know we're in a sequence,$$dna .= $line; # add the current line to $$dna. 
} elsif ( $line =~ /^ORIGIN/ ) { # If $line begins a sequence,$in_sequence = 1; # set the $in_sequence flag. 
} else{ # Otherwise 
push( @$annotation,$line); # add the current line to 
@annotation. 
} 
} 
# remove whitespace and line numbers from DNA sequence 
$$dna =~ s/[s0-9]//g; 
} 

Extract annotation and sequence from Genbank record 
#!/usr/bin/perl 
# Extract the annotation and sequence sections from the first 
#   record of a GenBank library 
use strict; 
use warnings; 
use BeginPerlBioinfo;
# Declare and initialize variables 
my $annotation = ''; 
my $dna = ''; 
my $record = ''; 
my $filename = 'record.gb'; 
my $save_input_separator = $/; 
# Open GenBank library file 
unless (open(GBFILE,$filename)) { 
print "Cannot open GenBank file "$filename"nn"; 
exit; 
} 
# Set input separator to "//n" and read in a record to a scalar 
$/ = "//n"; 
$record = <GBFILE>; 
# reset input separator 
$/ = $save_input_separator; 
# Now separate the annotation from the sequence data 
($annotation,$dna) = ($record =~ /^(LOCUS.*ORIGINs*n)(.*)//n/s); 
# Print the two pieces,which should give us the same as the 
#  original GenBank file,minus the // at the end 
print $annotation,$dna; 
exit;

Parsing GenBank annotations using arrays 
#!/usr/bin/perl 
# Parsing GenBank annotations using arrays 
use strict; 
use warnings; 
use BeginPerlBioinfo;     # see Chapter 6 about this module 
# Declare and initialize variables 
my @genbank = (  ); 
my $locus = ''; 
my $accession = ''; 
my $organism = ''; 
# Get GenBank file data 
@genbank = get_file_data('record.gb');

# Let's start with something simple.  Let's get some of the 
identifying 
# information,let's say the locus and accession number (here the 
same 
# thing) and the definition and the organism. 
for my $line (@genbank) { 
if($line =~ /^LOCUS/) { 
$line =~ s/^LOCUSs*//; 
$locus = $line; 
}elsif($line =~ /^ACCESSION/) { 
$line =~ s/^ACCESSIONs*//; 
$accession = $line; 
}elsif($line =~ /^  ORGANISM/) { 
$line =~ s/^s*ORGANISMs*//; 
$organism = $line; 
} 
} 
print "*** LOCUS ***n"; 
print $locus; 
print "*** ACCESSION ***n"; 
print $accession; 
print "*** ORGANISM ***n"; 
print $organism; 
exit; 


Listing the contents of a folder (or directory) 
#!/usr/bin/perl 
#   Demonstrating how to open a folder and list its contents 
use strict; 
use warnings; 
use BeginPerlBioinfo;     # see Chapter 6 about this module 
my @files = (  ); 
my $folder = 'pdb'; 
# open the folder 
unless(opendir(FOLDER,$folder)) { 
print "Cannot open folder $folder!n"; 
exit; 
} 
# read the contents of the folder (i.e. the files and subfolders) 
@files = readdir(FOLDER); 
# close the folder 
closedir(FOLDER); 
# print them out,one per line 
print join( "n",@files),"n"; 
exit; 


Extract sequence chains from PDB file 
#!/usr/bin/perl 
#  Extract sequence chains from PDB file 
use strict; 
use warnings; 
use BeginPerlBioinfo;

# Read in PDB file:  Warning - some files are very large! 
my @file = get_file_data('pdb/c1/pdb1c1f.ent'); 
# Parse the record types of the PDB file 
my %recordtypes = parsePDBrecordtypes(@file); 
# Extract the amino acid sequences of all chains in the protein 
my @chains = extractSEQRES( $recordtypes{'SEQRES'} ); 
# Translate the 3-character codes to 1-character codes,and print 
foreach my $chain (@chains) { 
print "****chain $chain **** n";

print "$chainn"; 
print iub3to1($chain),"n"; 
} 
exit;


# parsePDBrecordtypes 
# 
#--given an array of a PDB file,return a hash with 
#    keys   = record type names 
#    values = scalar containing lines for that record type 
sub parsePDBrecordtypes { 
my @file = @_; 
use strict; 
use warnings; 
my %recordtypes = (  ); 
foreach my $line (@file) { 
# Get the record type name which begins at the 
# start of the line and ends at the first space 
my($recordtype) = ($line =~ /^(S+)/); 
# .= fails if a key is undefined,so we have to 
# test for definition and use either .= or = depending 
if(defined $recordtypes{$recordtype} ) { 
$recordtypes{$recordtype} .= $line; 
}else{ 
$recordtypes{$recordtype} = $line; 
} 
} 
return %recordtypes; 
} 
# extractSEQRES 
# 
#--given an scalar containing SEQRES lines,#    return an array containing the chains of the sequence


sub extractSEQRES { 
use strict; 
use warnings; 
my($seqres) = @_; 
my $lastchain = ''; 
my $sequence = ''; 
my @results = (  ); 
# make array of lines 
my @record = split ( /n/,$seqres); 
foreach my $line (@record) { 
# Chain is in column 12,residues start in column 20 
my ($thischain) = substr($line,11,1); 
my($residues)  = substr($line,19,52); # add space at end 
# Check if a new chain,or continuation of previous chain 
if("$lastchain" eq "") { 
$sequence = $residues; 
}elsif("$thischain" eq "$lastchain") { 
$sequence .= $residues; 
# Finish gathering previous chain (unless first record) 
}elsif ( $sequence ) { 
push(@results,$sequence); 
$sequence = $residues; 
} 
$lastchain = $thischain; 
} 
# save last chain 
push(@results,$sequence); 
return @results; 
} 
# iub3to1 
# 
#--change string of 3-character IUB amino acid codes (whitespace 
separated) 
#    into a string of 1-character amino acid codes 
sub iub3to1 { 
my($input) = @_;

my %three2one = ( 
'ALA' => 'A','VAL' => 'V','LEU' => 'L','ILE' => 'I','PRO' => 'P','TRP' => 'W','PHE' => 'F','MET' => 'M','GLY' => 'G','SER' => 'S','THR' => 'T','TYR' => 'Y','CYS' => 'C','ASN' => 'N','GLN' => 'Q','LYS' => 'K','ARG' => 'R','HIS' => 'H','ASP' => 'D','GLU' => 'E',); 
# clean up the input 
$input =~ s/n/ /g; 
my $seq = ''; 
# This use of split separates on any contiguous whitespace 
my @code3 = split(' ',$input); 
foreach my $code (@code3) { 
# A little error checking 
if(not defined $three2one{$code}) { 
print "Code $code not definedn"; 
next; 
} 
$seq .= $three2one{$code}; 
} 
return $seq; 
} 


12 BLAST
Extract annotation and alignments from BLAST output file 
#!/usr/bin/perl 
# Extract annotation and alignments from BLAST output file 
use strict; 
use warnings; 
use BeginPerlBioinfo;     
# declare and initialize variables
my $beginning_annotation = ''; 
my $ending_annotation = ''; 
my %alignments = (  ); 
my $filename = 'blast.txt'; 
parse_blast($beginning_annotation,$ending_annotation,%alignments,and then 
#   print the DNA in new format just to check if we got it okay. 
print $beginning_annotation; 
foreach my $key (keys %alignments) { 
print "$keynXXXXXXXXXXXXn",$alignments{$key},"nXXXXXXXXXXXn"; 
} 
print $ending_annotation; 
exit; 

# parse_blast 
# 
# --parse beginning and ending annotation,and alignments,#     from BLAST output file 
sub parse_blast { 
my($beginning_annotation,$ending_annotation,$alignments,$filename) = @_; 
# $beginning_annotation--reference to scalar 
# $ending_annotation  --reference to scalar 
# $alignments 
--reference to hash 
# $filename 
--scalar 
# declare and initialize variables 
my $blast_output_file = ''; 
my $alignment_section = ''; 
# Get the BLAST program output into an array from a file 
$blast_output_file = join( '',get_file_data($filename));

# Extract the beginning annotation,alignments,and ending 
annotation 
($$beginning_annotation,$alignment_section,$$ending_annotation) 
= ($blast_output_file =~ /(.*^ALIGNMENTSn)(.*)(^ 
Database:.*)/ms); 
# Populate %alignments hash 
# key = ID of hit 
# value = alignment section 
%$alignments = parse_blast_alignment($alignment_section); 
} 
# parse_blast_alignment 
# 
# --parse the alignments from a BLAST output file,#       return hash with 
#       key = ID 
#       value = text of alignment 
sub parse_blast_alignment { 
my($alignment_section) = @_; 
# declare and initialize variables 
my(%alignment_hash) = (  ); 
# loop through the scalar containing the BLAST alignments,# extracting the ID and the alignment and storing in a hash 
# 
# The regular expression matches a line beginning with >,# and containing the ID between the first pair of | characters; 
# followed by any number of lines that don't begin with > 
while($alignment_section =~ /^>.*n(^(?!>).*n)+/gm) { 
my($value) = $&; 
my($key) = (split(/|/,$value)) [1]; 
$alignment_hash{$key} = $value; 
} 
return %alignment_hash; 
}


13 Further Topics

（编辑：李大同）

【声明】本站内容均来自网络，其相关言论仅代表作者个人观点，不代表本站立场。若无意侵犯到您的权利，请及时与联系站长删除相关内容!