#!/usr/bin/perl -w #################### # Arpy Saunders, 10/4/06 # # read_doubler revcom duplicates the dna found in a single read and formats # the resulting sequence in fasta format. The revcom of the doubled sequence # is then listed after the original doubled sequence. I checked this program on # 11/21/06 to make sure that when fed the original fastas (which have the 20 bp # repeat bookends) that these are trimmed off before the sequences are printed as # original and reverse complemented. # # use strict; use warnings; ################ Declare and initialize variables################# my $filename = 'repeatextractor_out.txt'; # name of input, dropped reads file my @all_reads = ''; my $record; my $read_id; my $read_id2; my $junk; # this variable holds useless stuff from a split my $element; my $dna; my $dna_length; my $crap; # this variable holds useless stuff from a split my $current_window2; my $query_window2; my @exact_match = ''; my @reverse_match = ''; my @revcomp_match = ''; ################################################################## # opens the fasta file containing the dropped reads and splits the single scalar # into an array, where each element is an individual read. split is done by the # '>' marker unless (open(FASTA, $filename)) { print "cannot open FASTA file \"$filename\"\n\n"; exit; } # set input separator to "//\n" and read in a record to a scalar $/ = "//\n"; # read the data into the scalar called "$record" $record = ; #split the scalar "record" into an array, delimited by the "<" @all_reads = split />/, $record; ################################################################### ### indicates print statements which produce an ongoing graphic as the script runs # this loop specifies actions to an individual element (read) in the array foreach $element (@all_reads) # takes on one read at a time { if ($element !~ /[A-Z]/) { next; } # defining the length of the read ($read_id, $crap) = split /(^<.*)|(\s)/s, $element; # splits the read # id ### print "READ_ID:\n$read_id\n\n"; ($junk, $dna) = split /\./s, $element; # pulls out the dna $dna =~ tr/xy12/ /; $dna =~ s/\n//g; # takes out newlines; $dna =~ s/\s//g; $dna_length = length $dna; # print "\n\n\noriginal length: $dna_length\n\n\n"; my $dna_length2 = ($dna_length - 20); # print "\n\n\nsubstring length: $dna_length2\n\n\n"; my $dna2 = substr ($dna, 0, $dna_length2); # print "\n\n\n$dna2\n\n\n"; my $dna_double = $dna2.$dna2; # print "\n\n\n$dna_double\n\n\n"; print_sequence ($dna_double, 50, $read_id); my $revcom = revcom ($dna_double); print_sequence2 ($revcom, 50, $read_id); } ############################ # # Subroutines # ############################ # revcom # # A subroutine to compute the reverse complement of DNA sequence sub revcom { my($dna) = @_; # First reverse the sequence my $revcom = reverse $dna; # Next, complement the sequence, dealing with upper and lower case # A->T, T->A, C->G, G->C $revcom =~ tr/ACGTacgt/TGCAtgca/; return $revcom; } ################################################### # print_sequence # # A subroutine to format and print sequence data sub print_sequence { my($sequence, $length, $read_id) = @_; use strict; use warnings; my $outputfile = 'readdoublerrevcom_output.txt'; unless (open (READDOUBLER_OUT, ">>$outputfile")) { print "cannot open file \"$outputfile\" to write to!\n\n"; exit; } print READDOUBLER_OUT ">$read_id\n"; # Print sequence in lines of $length for ( my $pos = 0 ; $pos < length($sequence) ; $pos += $length ) { print READDOUBLER_OUT substr($sequence, $pos, $length), "\n"; } } sub print_sequence2 { my($sequence, $length, $read_id) = @_; use strict; use warnings; my $outputfile = 'readdoublerrevcom_output.txt'; unless (open (READDOUBLER_OUT, ">>$outputfile")) { print "cannot open file \"$outputfile\" to write to!\n\n"; exit; } print READDOUBLER_OUT ">$read_id rc\n"; # Print sequence in lines of $length for ( my $pos = 0 ; $pos < length($sequence) ; $pos += $length ) { print READDOUBLER_OUT substr($sequence, $pos, $length), "\n"; } }