#!/usr/bin/perl -w
####################
# Arpy Saunders, 9/27/06
#
# This program extracts information about the number of repeats for a given
# length from the output of the program cent_repeat_finder. The output file is
# in tab-delimited format so it can be easily input into Excel.
#
# Input : crf_repeats_out.txt  (read from the current directory)
# Output: repeatcount_out.txt  (written to the current directory)

use strict;
use warnings;
use Scalar::Util qw(looks_like_number);

################ Declare and initialize variables #################

my $filename = 'crf_repeats_out.txt';    # name of input, dropped reads file

################ Read the input ###################################

# Three-argument open with a lexical handle; failure is reported on STDERR
# with the system error and a non-zero exit status.
open(my $fasta_fh, '<', $filename)
    or die "cannot open FASTA file \"$filename\": $!\n";

# Read ONE record, where a record ends at "//\n". The separator is localized
# so the rest of the program keeps the normal line-oriented behaviour.
# NOTE(review): only the first "//\n"-terminated record is read, exactly as
# the original did. If the input can hold several such records, confirm
# whether all of them should be counted.
my $record = do {
    local $/ = "//\n";
    scalar <$fasta_fh>;
};
close($fasta_fh);

# An empty input file yields no record at all; treat it as "no reads".
$record = '' unless defined $record;

################ Extract the repeat lengths #######################

# Each read starts with the marker "read_id:". The first chunk returned by
# split is whatever precedes the first marker (an empty string when the
# record starts with a marker), so it is never a read and is discarded.
my @all_reads = split /read_id:/, $record;
shift @all_reads;

my @repeats;
for my $element (@all_reads) {

    # Kept from the original: a chunk that still contains the text
    # "read_id" (i.e. a marker without its colon) is not processed.
    next if $element =~ m/read_id/;

    # Keep only the text in front of "\tsequence:" and strip all whitespace
    # (newlines included) from it.
    my ($header) = split /\tsequence:/, $element;
    next unless defined $header;
    $header =~ s/\s//g;

    # The header is colon-separated; only the third field, the repeat
    # length, is of interest.
    my $repeat_length = (split /:/, $header)[2];

    unless (defined $repeat_length && looks_like_number($repeat_length)) {
        warn "skipping a read without a numeric repeat length\n";
        next;
    }

    push @repeats, $repeat_length;
}

# No pre-sorting is needed: count_unique orders its own output.
count_unique(@repeats);

########################################################################
#
#                           Subroutines
#
########################################################################

######################
# count_unique
#
# This subroutine was taken from the faq section of a site on the internet.
# It counts how often each distinct element occurs in the list it is given,
# writes the result to 'repeatcount_out.txt' in tab-delimited format, and
# returns the counts.
#
# Arguments: a list of repeat lengths (expected to be numeric).
# Returns  : a hash (as a flat list) mapping each distinct length to the
#            number of times it occurred.
# Dies     : if the output file cannot be opened or completely written.
#
# Output file layout:
#   length_of_repeat <TAB> #_in_sample <TAB> total_repeats=<N>
#   <length>         <TAB> <count>
#   ...
# where <N> is the number of elements passed in and the data rows are
# ordered by ascending numeric length.
sub count_unique {
    my @array = @_;

    # Tally the occurrences of every distinct value.
    my %count;
    $count{$_}++ for @array;

    # Every element passed in is one read.
    my $number_total_reads = scalar @array;

    my $outputfile = 'repeatcount_out.txt';
    open(my $out_fh, '>', $outputfile)
        or die "cannot open file \"$outputfile\" to write to: $!\n";

    # Header row, terminated by a newline so the first data row starts on
    # its own line.
    print {$out_fh}
        "length_of_repeat\t#_in_sample\ttotal_repeats=$number_total_reads\n";

    # Numeric order (2 before 10); the string comparison only breaks ties
    # between different spellings of the same number.
    for my $length (sort { $a <=> $b or $a cmp $b } keys %count) {
        print {$out_fh} "$length\t$count{$length}\n";
    }

    # Buffered write errors only surface at close time.
    close($out_fh)
        or die "cannot finish writing \"$outputfile\": $!\n";

    return %count;
}