#!/usr/bin/perl -w #################### # Arpy Saunders, 3/4/07 # # scaffold_translator translates each independent fasta sequence into all 6 # frames. Each is then listed after eachother in an output file (+1, +2, +3, # -1, -2, -3); for every one fasta sequence input, there is thus 6 output. # # use strict; use warnings; ################ Declare and initialize variables################# my $filename = 'scaffold_translator_sample_input.txt'; # name of input, dropped reads file my $outputfile = 'scaffold_translator_sample_output.txt'; my @all_reads = ''; my $record; my $readid; my $crap; my $element; my $junk; my $junk2; my $dna_fasta; my $carrot = ">"; my $revcom_dna_fasta; my $dna; my $protein; ################################################################## # opens the fasta file containing the dropped reads and splits the single scalar # into an array, where each element is an individual read. split is done by the # '>' marker unless (open(FASTA, $filename)) { print "cannot open FASTA file \"$filename\"\n\n"; exit; } # prepares a file handle for the outputfile unless (open (TRANSLATOROUT, ">>$outputfile")) { print "cannot open file \"$outputfile\" to write to!\n\n"; exit; } # set input separator to "//\n" and read in a record to a scalar $/ = "//\n"; # read the data into the scalar called "$record" $record = ; #split the scalar "record" into an array, delimited by the "<" @all_reads = split />/, $record; ####################################################################### foreach $element (@all_reads) # takes on one read at a time { #add the carrot back onto the $element variable # $element = $carrot.$element; if ($element !~ /[A-Z]/) { next; } #print "\n\n$element\n\n"; ($junk, $dna_fasta) = split /scaffold_.*\n/, $element; ($readid, $junk2) = split /$dna_fasta/, $element; #remove newlines and spaces from readid $readid =~ s/\n//g; $readid =~ s/\s//g; #tacks the > symbol back on to the read id $readid = $carrot.$readid; #remove newlines and spaces from dna $dna_fasta =~ s/\n//g; $dna_fasta =~ s/\s//g; ###print $element; ###print "\nreadid: \n$readid"; ###print "\ndna:$dna_fasta\n"; #translate the sequence into all 6 reading frames ######## + frames first #+1 print TRANSLATOROUT $readid; print TRANSLATOROUT "_+1\n"; $protein = translate_frame ($dna_fasta,1); print_sequence($protein, 60); #+2 print TRANSLATOROUT $readid; print TRANSLATOROUT "_+2\n"; $protein = translate_frame ($dna_fasta,2); print_sequence($protein, 60); #+3 print TRANSLATOROUT $readid; print TRANSLATOROUT "_+3\n"; $protein = translate_frame ($dna_fasta,3); print_sequence($protein, 60); #calculate reverse complement $revcom_dna_fasta = revcom ($dna_fasta); ######## - frames second #-1 print TRANSLATOROUT $readid; print TRANSLATOROUT "_-1\n"; $protein = translate_frame ($revcom_dna_fasta,1); print_sequence($protein, 60); #-2 print TRANSLATOROUT $readid; print TRANSLATOROUT "_-2\n"; $protein = translate_frame ($revcom_dna_fasta,2); print_sequence($protein, 60); #-3 print TRANSLATOROUT $readid; print TRANSLATOROUT "_-3\n"; $protein = translate_frame ($revcom_dna_fasta,3); print_sequence($protein, 60); #print an extra line print TRANSLATOROUT "\n"; } ############# # # Subroutines # ############# #revcom # # a subroutine to compute the reverse complement of DNA sequence sub revcom { my ($dna) = @_; #first reverse the sequence my $revcom = reverse $dna; #next, complement the sequence, dealing with upper and lower cases $revcom =~ tr/ACGTacgt/TGCAtgca/; return $revcom; } #translate_frame # # A subroutine to translate a frame of DNA sub translate_frame { my ($seq, $start, $end) = @_; my $protein; # to make the subroutine easier to use, you won't need to specify the end # point - it will just go to the end of the sequence by default. unless ($end) { $end = length ($seq); } # finally, calculate and return the translation return dna2peptide (substr ($seq, $start -1, $end - $start + 1)); } # dna2peptide # # A subroutine to translate DNA sequence into a peptide sub dna2peptide { my($dna) = @_; use strict; use warnings; # Initialize variables my $protein = ''; # Translate each three-base codon to an amino acid, and append to a protein for(my $i=0; $i < (length($dna) - 2) ; $i += 3) { $protein .= codon2aa( substr($dna,$i,3) ); } return $protein; } # # codon2aa # # A subroutine to translate a DNA 3-character codon to an amino acid # Version 3, using hash lookup sub codon2aa { my($codon) = @_; $codon = uc $codon; my(%genetic_code) = ( 'TCA' => 'S', # Serine 'TCC' => 'S', # Serine 'TCG' => 'S', # Serine 'TCT' => 'S', # Serine 'TTC' => 'F', # Phenylalanine 'TTT' => 'F', # Phenylalanine 'TTA' => 'L', # Leucine 'TTG' => 'L', # Leucine 'TAC' => 'Y', # Tyrosine 'TAT' => 'Y', # Tyrosine 'TAA' => '*', # Stop 'TAG' => '*', # Stop 'TGC' => 'C', # Cysteine 'TGT' => 'C', # Cysteine 'TGA' => '*', # Stop 'TGG' => 'W', # Tryptophan 'CTA' => 'L', # Leucine 'CTC' => 'L', # Leucine 'CTG' => 'L', # Leucine 'CTT' => 'L', # Leucine 'CCA' => 'P', # Proline 'CCC' => 'P', # Proline 'CCG' => 'P', # Proline 'CCT' => 'P', # Proline 'CAC' => 'H', # Histidine 'CAT' => 'H', # Histidine 'CAA' => 'Q', # Glutamine 'CAG' => 'Q', # Glutamine 'CGA' => 'R', # Arginine 'CGC' => 'R', # Arginine 'CGG' => 'R', # Arginine 'CGT' => 'R', # Arginine 'ATA' => 'I', # Isoleucine 'ATC' => 'I', # Isoleucine 'ATT' => 'I', # Isoleucine 'ATG' => 'M', # Methionine 'ACA' => 'T', # Threonine 'ACC' => 'T', # Threonine 'ACG' => 'T', # Threonine 'ACT' => 'T', # Threonine 'AAC' => 'N', # Asparagine 'AAT' => 'N', # Asparagine 'AAA' => 'K', # Lysine 'AAG' => 'K', # Lysine 'AGC' => 'S', # Serine 'AGT' => 'S', # Serine 'AGA' => 'R', # Arginine 'AGG' => 'R', # Arginine 'GTA' => 'V', # Valine 'GTC' => 'V', # Valine 'GTG' => 'V', # Valine 'GTT' => 'V', # Valine 'GCA' => 'A', # Alanine 'GCC' => 'A', # Alanine 'GCG' => 'A', # Alanine 'GCT' => 'A', # Alanine 'GAC' => 'D', # Aspartic Acid 'GAT' => 'D', # Aspartic Acid 'GAA' => 'E', # Glutamic Acid 'GAG' => 'E', # Glutamic Acid 'GGA' => 'G', # Glycine 'GGC' => 'G', # Glycine 'GGG' => 'G', # Glycine 'GGT' => 'G', # Glycine ); if ($codon =~ /N/) { return "X"; } elsif(exists $genetic_code{$codon}) { return $genetic_code{$codon}; }else{ print STDERR "Bad codon \"$codon\"!!\n"; exit; } } # print_sequence # # A subroutine to format and print sequence data sub print_sequence { my($sequence, $length) = @_; use strict; use warnings; # Print sequence in lines of $length for ( my $pos = 0 ; $pos < length($sequence) ; $pos += $length ) { print TRANSLATOROUT substr($sequence, $pos, $length), "\n"; } }