📄 reformat-seq.sh
字号:
#!/bin/sh
# reformat-seq.sh
#
# Usage: reformat-seq.sh in_fil
#
# Input:  sequence file $1.
# FILE_PREFIX: $1 with one trailing .seq, .gb, .embl or .fasta extension
#         (lower or upper case) removed.
# Output: ${FILE_PREFIX}-local.seq is written next to the input file and
#         FILE_PREFIX is printed on stdout.
#
# Cases (tested in this order):
#   GenBank / EMBL  The LOCUS (or ID) line becomes the FASTA header, with
#                   runs of spaces compressed to single spaces and truncated
#                   to 72 characters. The lines between ORIGIN (or SQ) and
#                   the first "//" are copied with digits removed and runs
#                   of spaces compressed. The output is in FASTA format.
#   FASTA           The header line is truncated at 80 characters; the
#                   sequence has everything but letters removed and is
#                   wrapped to 75 nt per record.
#   IG              (Intelligenetics, first line starts with ";") is left as
#                   is; the output file is just a symbolic link to the input.
#   anything else   Treated as raw sequence data. The internal name for the
#                   sequence is FILE_PREFIX and the output is in FASTA format.
#   GCG is no longer accepted.
#
# Exit status: 0 success, 1 usage error, 2 input missing or empty,
#              3 GenBank/EMBL file without a sequence start line.
#
# NOTE: error messages are deliberately written to stdout, as they always
# have been, so that callers capturing this script's output keep seeing them.

set -u

# Print the number of the first line in file $2 that matches ERE $1.
# Prints nothing when no line matches.
first_match() {
  grep -n -E -- "$1" < "$2" | head -n 1 | cut -d: -f1
}

# Print line number $1 of file $2 as a header: runs of spaces squeezed,
# one leading space dropped, truncated to 72 characters.
header_line() {
  awk -v n="$1" 'NR == n { print; exit }' < "$2" |
    tr -s ' ' | sed 's/^ //' | cut -c1-72
}

# Filter stdin down to letters only, wrap at 75 per line and finish with a
# newline (fold's last line is unterminated because all newlines were removed).
wrap_residues() {
  LC_ALL=C tr -cd 'A-Za-z' | fold -w 75
  printf '\n'
}

# Start a fresh output file containing the FASTA header ">$1".
# The old output is removed first: it may be a symbolic link to the input
# left by an earlier IG run, and writing through it would truncate the input.
start_output() {
  rm -f -- "$OUTPUT"
  printf '>%s\n' "$1" > "$OUTPUT"
}

# Convert a GenBank/EMBL style flat file ($INPUT) into $OUTPUT.
#   $1  line number of the header (LOCUS / ID) line
#   $2  ERE matching the line that precedes the sequence data
#   $3  format name used in the error message
#   $4  tag name used in the error message
# Exits with status 3 when the sequence start line is missing.
# (POSIX sh has no "local"; the variables below are script globals.)
convert_flatfile() {
  seq_name=$(header_line "$1" "$INPUT")

  seq_first=$(first_match "$2" "$INPUT")
  if [ -z "$seq_first" ]; then
    printf 'Corrupted %s file. No %s line found.\n' "$3" "$4"
    exit 3
  fi
  seq_first=$((seq_first + 1))

  # The sequence ends on the line before the first "//"; without a
  # terminator it runs to the last line of the file.
  seq_last=$(first_match '^//' "$INPUT")
  if [ -z "$seq_last" ]; then
    seq_last=$(awk 'END { print NR }' < "$INPUT")
  else
    seq_last=$((seq_last - 1))
  fi

  start_output "$seq_name"
  # Keep newlines, blanks, commas and letters (this drops the position
  # numbers), then squeeze runs of spaces.
  awk -v s="$seq_first" -v e="$seq_last" 'NR > e { exit } NR >= s' < "$INPUT" |
    LC_ALL=C tr -cd '\012 ,A-Za-z' | tr -s ' ' >> "$OUTPUT"
}

# Report the prefix to the caller and finish successfully.
finish() {
  printf '%s\n' "$FILE_PREFIX"
  exit 0
}

if [ $# -lt 1 ]; then
  printf ' *** Usage: reformat-seq.sh in_fil,\nwhere in_fil is the name of the input sequence file.\n'
  exit 1
elif [ ! -s "$1" ]; then
  printf 'The file, %s, does not exist or is empty\n' "$1"
  exit 2
fi

INPUT=$1

# Strip a known extension only when it is the actual suffix of the name.
case $INPUT in
  *.seq | *.gb | *.embl | *.fasta | *.SEQ | *.GB | *.EMBL | *.FASTA)
    FILE_PREFIX=${INPUT%.*}
    ;;
  *)
    FILE_PREFIX=$INPUT
    ;;
esac
OUTPUT=${FILE_PREFIX}-local.seq

# Test for GenBank
header_index=$(first_match '^LOCUS ' "$INPUT")
if [ -n "$header_index" ]; then
  # Some writers emit a bare "ORIGIN" with no trailing blanks; accept both.
  convert_flatfile "$header_index" '^ORIGIN([[:space:]]|$)' GenBank ORIGIN
  finish
fi

# Test for EMBL
header_index=$(first_match '^ID ' "$INPUT")
if [ -n "$header_index" ]; then
  convert_flatfile "$header_index" '^SQ ' EMBL SQ
  finish
fi

first_line=$(head -n 1 < "$INPUT")
case $first_line in
  '>'*)
    # FASTA: keep the header text as is, limited to 80 characters in total.
    start_output "$(printf '%s\n' "$first_line" | cut -c2-80)"
    tail -n +2 < "$INPUT" | wrap_residues >> "$OUTPUT"
    finish
    ;;
  ';'*)
    # IG: link to the input. The link lives in the input's directory, so the
    # target must be the bare file name for a relative path to resolve.
    rm -f -- "$OUTPUT"
    ln -s -- "${INPUT##*/}" "$OUTPUT"
    finish
    ;;
esac

# At this point, assume raw sequence data and hope for the best
start_output "$FILE_PREFIX"
wrap_residues < "$INPUT" >> "$OUTPUT"
finish
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -