#!/usr/bin/perl -w #################### # Arpy Saunders, 10/03/06 # # repeat_extractor inputs a FASTA file and pulls out those sequences which fall # within a length-range defined by the user. These variables are called the # $lowerlimit and the $upper limit. This program also analyzes the A content # and the AT content of the selected repeats. This analysis should establish # whether A content is diagnostic of the repeat orientation. If AT content is # equal, and A content is bimodal, repeats of one A percent can be reverse # complemented so all repeats can be appropriately aligned. use strict; use warnings; ################ Declare and initialize variables################# my $filename = 'crf_repeats_out_fasta.txt'; # name of input, dropped reads file my $outputfile = 'repeatextractor_out.txt'; my $lowerlimit = 720; my $upperlimit = 732; my @all_reads = ''; my $record; my $element; my $read_id; my @percent_a = ''; my @percent_at = ''; #############################input################################## # opens the fasta file containing the dropped reads and splits the single scalar # into an array, where each element is an individual read. split is done by the # '>' marker unless (open(FASTA, $filename)) { print "cannot open FASTA file \"$filename\"\n\n"; exit; } # set input separator to "//\n" and read in a record to a scalar $/ = "//\n"; # read the data into the scalar called "$record" $record = ; #######################output(puts extracted repeats in FASTA ################ unless (open (REPEATS_OUT, ">>$outputfile")) { print "cannot open file \"$outputfile\" to write to!\n\n"; exit; } ################################################################# #split the scalar "record" into an array, delimited by the "<" @all_reads = split />/, $record; # go through each element foreach $element (@all_reads) { my $basecount = 0; my $basecount2 = 0; my $basecount_a = 0; my $basecount_t = 0; my $percent_a = 0; my $percent_t = 0; my $percent_at = 0; my $dna = 0; # seperates dna from header ($read_id, $dna) = split /^.........../, $element; # counts the bases of repeat (basecount) and total bases (basecount2) # recall, 20 bases are added onto the exact repeat. $basecount = ($dna =~ tr/ACGTacgt//); $basecount2 = ($basecount - 20); # false reads are skipped if ($basecount == 0) {next; } # these if statements define those elements which qualify for the # user specified repeat length range if ($basecount2 > $lowerlimit) { if ($basecount2 < $upperlimit) { # prints the extracted repeats to FASTA print REPEATS_OUT ">$element\n"; # calculates the percent a $basecount_a = ($dna =~ tr/Aa//); $basecount_t = ($dna =~ tr/Tt//); $percent_a = (($basecount_a/$basecount) * 100); $percent_t = (($basecount_t/$basecount) * 100); $percent_at = ($percent_a + $percent_t); push (@percent_a, $percent_a); push (@percent_at,$percent_at); } } } count_unique (@percent_a); count_unique2 (@percent_at); #getting a hash list of numbers and thereby removing duplicates ######################################################################## # # Subroutines # ######################################################################## ###################### # count_unique # # this subroutine was taken from the faq section of a site on the internet. # It functions to count the unique elements of an array and store them as a hash. # This subroutine also functions to print the output file in the appropriate tab # delimited format. sub count_unique { my @array = @_; my %count; my $hash; my %hash = (); my $number_total_reads; my $array; $number_total_reads = $#array; map { $count{$_}++ } @array; my $outputfile2 = 'repeat_percenta_out.txt'; unless (open (REPEATCOUNT_OUT, ">$outputfile2")) { print "cannot open file \"$outputfile\" to write to!\n\n"; exit; } #print them out: print REPEATCOUNT_OUT "percent_a\t#_in_sample\ttotal_repeats=$number_total_reads"; map {print REPEATCOUNT_OUT "$_\t${count{$_}}\n"} sort keys(%count); #or just return the hash: return %count; } ######################## sub count_unique2 { my @array = @_; my %count; my $hash; my %hash = (); my $number_total_reads; my $array; $number_total_reads = $#array; map { $count{$_}++ } @array; my $outputfile2 = 'repeat_percentat_out.txt'; unless (open (REPEATCOUNT_OUT, ">$outputfile2")) { print "cannot open file \"$outputfile\" to write to!\n\n"; exit; } #print them out: print REPEATCOUNT_OUT "percent_at\t#_in_sample\ttotal_repeats=$number_total_reads"; map {print REPEATCOUNT_OUT "$_\t${count{$_}}\n"} sort keys(%count); #or just return the hash: return %count; }