#!/usr/bin/perl -w
####################
# Arpy Saunders, 10/4/06
#
# read_doubler duplicates the DNA found in a single read and formats
# the resulting sequence in FASTA format.  Checked 11/9 --> this version works
# to double reads!
#
# How it works:
#   * Input is read from 'repeatextractor_out.txt' in the current directory.
#     Only the first chunk of that file is read: everything up to and
#     including the first "//\n", or the whole file if "//\n" never occurs.
#   * The chunk is cut into reads at every '>' character.
#   * For each read, the id is the text before the first whitespace
#     character and the DNA is the text between the first and second '.'
#     (or from the first '.' to the end when there is only one).
#   * The characters x, y, 1, 2 and all whitespace are removed from the DNA,
#     the DNA is concatenated with itself, and the result is passed to
#     print_sequence() with a line length of 50.
#
use strict;
use warnings;

################ Declare and initialize variables#################
my $filename = 'repeatextractor_out.txt';    # name of input, dropped reads file
##################################################################

# Open the FASTA file containing the dropped reads.  Failure is fatal and is
# reported on STDERR together with the operating-system reason.
open(my $fasta, '<', $filename)
    or die "cannot open FASTA file \"$filename\": $!\n";

# Read a single record into a scalar, using "//\n" as the input record
# separator.  The separator is localized so the change cannot leak into any
# other code that reads line by line.
my $record = do {
    local $/ = "//\n";
    <$fasta>;
};
close($fasta);

# An empty input file yields no record at all; treat it as "no reads".
$record = '' unless defined $record;

# Split the scalar "record" into an array of reads, delimited by the '>'.
my @all_reads = split />/, $record;

###################################################################
### indicates print statements which produce an ongoing graphic as the script runs

# This loop processes one element (read) of the array at a time.
foreach my $element (@all_reads) {

    # Skip fragments that contain no upper-case letter, such as the empty
    # text that precedes the first '>'.
    next if $element !~ /[A-Z]/;

    # The read id is the first field of this split, i.e. whatever precedes
    # the first whitespace character.
    # NOTE(review): the '^<' alternative only fires for a read that starts
    # with '<' (giving an empty id) -- confirm that is intended.
    my ($read_id) = split /(^<.*)|(\s)/s, $element;
    ### print "READ_ID:\n$read_id\n\n";

    # The DNA is the field that follows the first '.' in the read.
    my (undef, $dna) = split /\./s, $element;

    # A read without any '.' has no DNA field; use an empty sequence rather
    # than operating on an undefined value.
    $dna = '' unless defined $dna;

    # Remove the characters x, y, 1, 2 and every whitespace character
    # (newlines included).
    $dna =~ s/[xy12\s]//g;

    print_sequence($dna . $dna, 50, $read_id);
}
###################################################
# print_sequence
#
# A subroutine to format and print sequence data.
#
# Appends one FASTA-style record to 'readdoubler_output.txt' in the current
# working directory: a ">$read_id" header line followed by $sequence broken
# into lines of at most $length characters.
#
# Arguments:
#   $sequence - the sequence string to write
#   $length   - maximum number of characters per output line; must be >= 1
#   $read_id  - identifier written after the '>' on the header line
#
# Returns nothing.  Dies if $length is undefined or less than 1, or if the
# output file cannot be opened or closed.
sub print_sequence {
    my ($sequence, $length, $read_id) = @_;

    my $outputfile = 'readdoubler_output.txt';

    # A line length below 1 would never advance $pos in the loop below, so
    # the loop would run forever while appending to the file.  Reject it
    # before the file is touched.
    die "print_sequence: line length must be at least 1\n"
        unless defined $length && $length >= 1;

    open(my $out, '>>', $outputfile)
        or die "cannot open file \"$outputfile\" to write to! ($!)\n";

    print {$out} ">$read_id\n";

    # Print sequence in lines of $length
    for (my $pos = 0; $pos < length($sequence); $pos += $length) {
        print {$out} substr($sequence, $pos, $length), "\n";
    }

    # Close explicitly so the record is on disk when the call returns and so
    # that buffered write errors are reported instead of being lost.
    close($out)
        or die "cannot close file \"$outputfile\": $!\n";

    return;
}