📄 fasta-fetch
字号:
#!/usr/bin/perl$pgm = $0; # name of program$pgm =~ s#.*/##; # remove part up to last slash$status = 0; # exit status#$SIG{'INT'} = 'cleanup'; # interrupt handler$line_size = 50; # size of lines to print$off = 1; # first position of sequence to print$len = -1; # maximum length of sequence to print $usage = <<USAGE; # usage message USAGE: $pgm <fasta> [-f <file> | [<id>]+] [-c] [-s <suf>] [-off <off>] [-len <len>] <fasta> name of FASTA sequence file [-f <file>] file containing list of sequence identifiers [<id>] sequence identifier [-c] check that first word in fasta id matches <id> [-s <suf>] put each sequence in a file named "after" the sequence identifier with ".<suf>" appended; pipes in file names are changed to underscores [-off <off>] print starting at position <off>; default: $off [-len <len>] print up to <len> characters for each seq; default: print entire sequence Note: Assumes and index file has been made using fasta-make-index. Sequence identifiers must be same as made by fasta-make-index. Reads sequence identifiers from the command line, from a file and from standard input, in that order. Fetch sequences from a FASTA sequence file and write to standard output.USAGEif ($#ARGV+1 < 1) { # wrong number of arguments die $usage;}# get input arguments$fasta = shift; # file of FASTA sequenceswhile ($#ARGV >= 0) { $_ = shift; if ($_ eq "-f") { # from file $filename = shift; } elsif ($_ eq "-c") { # check name match $check = 1; } elsif ($_ eq "-s") { # put in files $suffix = shift; $print_to_files = 1; } elsif ($_ eq "-off") { # offset $off = shift; } elsif ($_ eq "-len") { # maximum length $len = shift; } else { # file name push @target, $_; }}# get targets from fileif ($filename) { open(IDFILE, "<$filename") || die("Can't open file $filename\n"); while (<IDFILE>) { next if (/^#/ || /^\s*$/); # skip comments an blank lines @words = split; push @target, @words; } close(IDFILE);}#while (<>) { # @words = split;# push @target, @words;#}# open the FASTA fileopen(FASTA, "<$fasta") || die "Can't open file $fasta\n";# open the index file$index = "$fasta.index";open(INDEX, "<$index") || die "Can't open $index\n";# read in the indexwhile (<INDEX>) { @words = split; $id = $words[0]; # sequence id $addr = $words[1]; # start of sequence $index{$id} = "$index{$id} $addr";}# use direct access to get each targetforeach $id (@target) { $_ = $index{">$id"}; # address of sequence if ($_ eq undef) { print STDERR "Can't find target $id\n"; next; # go to next target } # open output file if requested if ($print_to_files) { $outfile = $id; $outfile =~ s/\|/_/g; $outfile .= ".$suffix" if ($suffix); open(OUTPUT, ">$outfile") || die("Can't open file $outfile : $!\n"); } else { # print to files open(OUTPUT, ">-"); } foreach $addr (split) { # one-to-many seek(FASTA, $addr, 0); # move to start of target $_ = <FASTA>; $idline = $_; @words = split; if ($check && $words[0] ne ">$id") { $tmp = $words[0]; @words2 = split(/[>\|]/, $tmp); $tmp = $words2[$#words2]; if ($tmp ne $id) { $lower = $tmp; $lower =~ tr/A-Z/a-z/; if ($lower ne $id) { print STDERR "Bad index? target= >$id found= $words[0]\n"; next; # go to next target } } } print OUTPUT "$idline"; # print header $seq = ""; while (<FASTA>) { # read sequence lines if (/^>/) {last} # start of next sequence # print sequence line chop; $seq .= $_; } # print sequence in lines of length $line_size # get portion of sequence to print if -off and/or -len given if ($off != 1 || $len != -1) { if ($len == -1) { $seq = substr($seq, $off-1); } else { $seq = substr($seq, $off-1, $len); } } for ($i=0; $i<length($seq); $i+=$line_size) { print substr($seq, $i, $line_size), "\n"; } } # addr close(OUTPUT);} # id
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -