#!/usr/bin/perl -w #################### # Arpy Saunders, 9/27/06 # # cent_repeat_finder is a program that scans dropped sequencing reads from # a genome project to identify tandem repeats which are good candidates for # centromeric repeats use strict; use warnings; ################ Declare and initialize variables################# my $filename = 'crf_input.txt'; # name of input, dropped reads file my @all_reads = ''; my $record; my $read_id; my $junk; # this variable holds useless stuff from a split my $element; my $dna; my $crap; # this variable holds useless stuff from a split my $current_window2; my $query_window2; ################################################################## # opens the fasta file containing the dropped reads and splits the single scalar # into an array, where each element is an individual read. split is done by the # '>' marker unless (open(FASTA, $filename)) { print "cannot open FASTA file \"$filename\"\n\n"; exit; } # set input separator to "//\n" and read in a record to a scalar $/ = "//\n"; # read the data into the scalar called "$record" $record = ; #split the scalar "record" into an array, delimited by the "<" @all_reads = split />/, $record; ################################################################### ### indicates print statements which produce an ongoing graphic as the script runs # this loop specifies actions to an individual element (read) in the array SINGLEREAD: foreach $element (@all_reads) # takes on one read at a time { my $dna_length = length $dna; # defining the length of the read ($read_id, $crap) = split /(^<.*)|(\s)/s, $element; # splits the read # id ### print "READ_ID:\n$read_id\n\n"; ($junk, $dna) = split /2006/s, $element; # pulls out the dna $dna =~ s/\n//g; # takes out newlines; # dna is now one string ### print "\n\n$dna\n\n"; # setting the current and query window positions my $current_position = 0; my $current_window; $current_window = substr($dna, $current_position, 10); ### print "\n\n$current_window\n\n"; my $query_position = 121; my $query_window; $query_window = substr($dna, $query_position, 10); ### print "\n\n$query_window\n\n"; #for loop takes a single 10bp window starting at position 0 # Windows with non-nucleotide characters are skipped. for ($current_position = 0; $current_position < length $dna; $current_position+=11) { $current_window = substr($dna, $current_position, 10); if ($current_window =~ m/[^AaCcTtGg]/) { ### print "\nnon-nucleotide window\n"; next; } ### print "current window: \n$current_window\n"; # for each window, compare 10bp windows starting 120 bp away in a # "sliding window" format. If there's a match, see if the next 10bp # at either position also match for ($query_position = ($current_position + 120); ($query_position < ((length $dna) - 10)); $query_position++) { $query_window = substr($dna, $query_position, 10); ### print "\t query window: $query_window\n"; if ($current_window =~ m/$query_window/) { ### print "\t\t\t <------this query window matched at $query_position\n"; my $current_window2 = substr($dna, ($current_position + 10), 10); my $query_window2 = substr($dna, ($query_position + 10), 10); ### print "\n current window 2: $current_window2"; ### print "\n query window 2: $query_window2\n"; # if both sets of windows match, calculate the repeat length. if ($current_window2 =~ m/$query_window2/) { my $repeat; # the sequence between 20bp matchin windows my $repeat_length; # the distance between matching positions in current and query windows my $repeat_length_20; # repeat length + 20 bps, to pull out sequence with the same begining # and ending 20bps $repeat_length = ($query_position - $current_position); $repeat_length_20 = ($repeat_length + 20); $repeat = substr($dna, $current_position, $repeat_length_20); ### print "\n\n###################################\n\n current position = $current_position\n query position = $query_position\n my repeat length = $repeat_length\n repeat = $repeat \n\n############################\n"; my $outputfile = 'crf_repeats_out.txt'; unless (open (REPEATS_OUT, ">>$outputfile")) { print "cannot open file \"$outputfile\" to write to!\n\n"; exit; } print REPEATS_OUT "\nread_id:$read_id\n \n\tmatch_sequence: $current_window $current_window2 \n\trepeat_length: $repeat_length \n\tsequence:\n\t\t\t"; print_sequence ($repeat, 50); my $outputfile2 = 'crf_repeats_out_fasta.txt'; unless (open (REPEATS_OUT_FASTA, ">>$outputfile2")) { print "cannot open file \"$outputfile2\" to write to!\n\n"; exit; } print REPEATS_OUT_FASTA ">$read_id\n"; print_sequence2 ($repeat, 50); next SINGLEREAD; } } } } } ################################################################# # Subroutines (to format sequence printing) ################################################################# # print_sequence (and print_sequence2) # # A subroutine to format and print sequence data sub print_sequence { my($sequence, $length) = @_; use strict; use warnings; # Print sequence in lines of $length for ( my $pos = 0 ; $pos < length($sequence) ; $pos += $length ) { print REPEATS_OUT substr($sequence, $pos, $length), "\n\t\t\t"; } } sub print_sequence2 { my($sequence, $length) = @_; use strict; use warnings; # Print sequence in lines of $length for ( my $pos = 0 ; $pos < length($sequence) ; $pos += $length ) { print REPEATS_OUT_FASTA substr($sequence, $pos, $length), "\n"; } }