#!/usr/bin/perl -w #################### # Arpy Saunders, 10/30/06 # # parse_scaffolds is a script which reads through a fasta file of genome scaffolds # and returns only those scaffolds of interest # # the user specifies the name of the original fasta file and the scaffold numbers # in the "USER FIELDS" part of the script. The scaffolds of interest are printed # in the file called "scaffold_out" in fasta format. # use strict; use warnings; # Declare and initialize variables my $record = ''; my @all_scaffolds = (); my @subset_scaffolds = (); my $element = ''; my $element2 = ''; my $element_interest = ''; my $carrot= "\n>"; my $save_input_separator = $/; my $dna; my $junk; my $junk2; my $readid; ########################################## # USER FIELDS: # my filename = name of the fasta file containing scaffolds # my @scaffolds_of_interest = ("scaffold_XXX", "scaffold_YYY"); my @scaffolds_of_interest = ("scaffold_323"); my $filename = 'parse_scaffolds_sample_in.txt'; # Open GenBank Library file unless (open(GBFILE, $filename)) { print "cannot open GenBank file \"$filename\"\n\n"; exit; } # set input separator to "//\n" and read in a record to a scalar $/ = "//\n"; # read the data into the scalar called "$record" $record = ; #split the scalar "record" into an array, delimited by the ">" @all_scaffolds = split />/s, $record; # go through each fasta element of @all scaffolds and pull out those of interest foreach $element (@all_scaffolds) { #print "\n\n$element\n\n"; ($junk, $dna) = split /scaffold_.*\n/, $element; ($readid, $junk2) = split /$dna/, $element; $readid =~ s/\n//g; $readid =~ s/\s//g; #print "$readid\n"; #print "s.o.i: $scaffolds_of_interest"; foreach $element_interest (@scaffolds_of_interest) { if ($element_interest =~ m/^$readid$/) { #print "/n/ngot one!\n\n"; $element2 = $carrot.$element; push @subset_scaffolds, $element2; } } } # print on screen # print "Collected Scaffolds: @subset_scaffolds"; # print to 'scaffold_out' file my $outputfile = 'parse_scaffolds_sample_out.txt'; unless (open (SCAFFOLD_OUT, ">$outputfile")) { print "cannot open file \"$outputfile\" to write to!!!!\n\n"; exit; } print SCAFFOLD_OUT @subset_scaffolds; close (SCAFFOLD_OUT);