#!/usr/bin/perl -w
#######################
#
# Scaffold_Chopper chops up scaffolds into subsets of a user defined length.
# Subsets with "Ns" are discarded.
#
# Input : a FASTA-style file; each scaffold starts with ">" followed by an
#         id line, then one or more lines of sequence.
# Output: one FASTA-style entry per retained subset, named
#         "<scaffold id>_<subset number>", appended to the output file.
#
#######################

use strict;
use warnings;

# Define and initialize variables

# User variables
my $scaffold_input = 'scaffold_chopper_sample_input.txt';
my $subset_length  = 400;
my $outputfile     = 'scaffold_chopper_sample_output.txt';

# Non-user variables
# Number of sequence characters written per output line.
my $line_width = 60;

# Run the program only when this file is executed directly, so that the
# subroutines below can be loaded (require/do) without touching any files.
main() unless caller;

########## Sub-Routines #######################################################

# main
#
# Reads the scaffold file, chops every scaffold into subsets of
# $subset_length characters and appends the N-free subsets to the output file.
# Dies with a message if the subset length is not a positive integer or if a
# file cannot be opened or closed.
sub main {

    # A zero or negative step would make the chopping loop run forever.
    die "subset length must be a positive integer, got \"$subset_length\"\n"
        unless $subset_length =~ /\A[1-9][0-9]*\z/;

    # open input and output files
    open(my $in_fh, '<', $scaffold_input)
        or die "cannot open FASTA total file \"$scaffold_input\": $!\n";

    # ">>" appends: results of earlier runs are kept in the output file.
    open(my $out_fh, '>>', $outputfile)
        or die "cannot open file \"$outputfile\" to write to: $!\n";

    # Read one record: everything up to the first "//\n" terminator, or the
    # whole file when no terminator is present. The separator is localized so
    # that no other read in the program is affected.
    my $scaffold_record;
    {
        local $/ = "//\n";
        $scaffold_record = <$in_fh>;

        # Drop the terminator itself so it never ends up in a sequence.
        chomp $scaffold_record if defined $scaffold_record;
    }
    close $in_fh;

    # An empty input file yields no record at all.
    $scaffold_record = '' unless defined $scaffold_record;

    # separate the record into individual scaffolds
    for my $scaffold (split />/, $scaffold_record) {

        # skip elements that do not actually contain DNA
        my ($scaffold_id, $dna) = _parse_scaffold($scaffold);
        next unless defined $dna;

        # The counter numbers only the subsets that are kept.
        my $subset_counter = 0;
        for my $current_subset (_chop_sequence($dna, $subset_length)) {
            $subset_counter++;

            # print the scaffold_id with the subset counter
            print {$out_fh} "\n", '>', $scaffold_id, '_', $subset_counter, "\n";
            print_sequence($current_subset, $line_width, $out_fh);
        }
    }

    # Buffered write errors only surface when the handle is closed.
    close($out_fh)
        or die "cannot close file \"$outputfile\": $!\n";

    return;
}

# _parse_scaffold
#
# Splits one scaffold (the text that followed a ">") into its id line and its
# sequence, and removes all whitespace from both.
# Returns ($scaffold_id, $dna), or an empty list when the element has no
# sequence part or the sequence contains none of A, C, G, T (any case).
sub _parse_scaffold {
    my ($scaffold) = @_;

    # The first line is the id; everything after it is sequence.
    my ($scaffold_id, $dna) = split /\n/, $scaffold, 2;
    return unless defined $dna;

    # remove the whitespace and newlines
    $scaffold_id =~ s/\s+//g;
    $dna         =~ s/\s+//g;

    # Character class, not the literal string "ATCG": any single base is
    # enough to show that the element holds sequence data.
    return unless $dna =~ /[ACGT]/i;

    return ($scaffold_id, $dna);
}

# _chop_sequence
#
# Cuts $dna into consecutive pieces of $length characters (the last piece may
# be shorter) and returns, in order, the pieces that contain no "N" or "n".
# $length must be a positive integer.
sub _chop_sequence {
    my ($dna, $length) = @_;

    my @subsets;
    for (my $pos = 0; $pos < length $dna; $pos += $length) {
        my $subset = substr($dna, $pos, $length);

        # skip over subsets containing "Ns"
        push @subsets, $subset unless $subset =~ /N/i;
    }

    return @subsets;
}

# _wrap_sequence
#
# Returns $sequence broken into lines of at most $length characters, each
# terminated by a newline. An empty sequence gives an empty string.
sub _wrap_sequence {
    my ($sequence, $length) = @_;

    my $wrapped = '';
    for (my $pos = 0; $pos < length $sequence; $pos += $length) {
        $wrapped .= substr($sequence, $pos, $length) . "\n";
    }

    return $wrapped;
}

# print_sequence
#
# A subroutine to format and print sequence data.
#   $sequence - the sequence to print
#   $length   - number of characters per output line
#   $fh       - optional filehandle to print to; when omitted the global
#               SCAFFOLDSUBSETS handle is used, as in earlier versions
sub print_sequence {
    my ($sequence, $length, $fh) = @_;

    if (!defined $fh) {

        # The legacy handle is named only here, which would otherwise
        # trigger a "used only once" warning.
        no warnings 'once';
        $fh = \*SCAFFOLDSUBSETS;
    }

    # Print sequence in lines of $length
    print {$fh} _wrap_sequence($sequence, $length);

    return;
}

1;