📄 blast_engine.c
字号:
/* * =========================================================================== * PRODUCTION $Log: blast_engine.c,v $ * PRODUCTION Revision 1000.6 2004/06/01 18:07:09 gouriano * PRODUCTION PRODUCTION: UPGRADED [GCC34_MSVC7] Dev-tree R1.135 * PRODUCTION * =========================================================================== *//* $Id: blast_engine.c,v 1000.6 2004/06/01 18:07:09 gouriano Exp $ * =========================================================================== * * PUBLIC DOMAIN NOTICE * National Center for Biotechnology Information * * This software/database is a "United States Government Work" under the * terms of the United States Copyright Act. It was written as part of * the author's offical duties as a United States Government employee and * thus cannot be copyrighted. This software/database is freely available * to the public for use. The National Library of Medicine and the U.S. * Government have not placed any restriction on its use or reproduction. * * Although all reasonable efforts have been taken to ensure the accuracy * and reliability of the software and data, the NLM and the U.S. * Government do not and cannot warrant the performance or results that * may be obtained by using this software or data. The NLM and the U.S. * Government disclaim all warranties, express or implied, including * warranties of performance, merchantability or fitness for any particular * purpose. * * Please cite the author in any work or product based on this material. * * =========================================================================== * * Author: Ilya Dondoshansky * *//** @file blast_engine.c * High level BLAST functions */static char const rcsid[] = "$Id: blast_engine.c,v 1000.6 2004/06/01 18:07:09 gouriano Exp $";#include <algo/blast/core/blast_engine.h>#include <algo/blast/core/lookup_wrap.h>#include <algo/blast/core/aa_ungapped.h>#include <algo/blast/core/blast_util.h>#include <algo/blast/core/blast_setup.h>#include <algo/blast/core/blast_gapalign.h>#include <algo/blast/core/blast_traceback.h>#include <algo/blast/core/phi_extend.h>#include <algo/blast/core/link_hsps.h>/** Deallocates all memory in BlastCoreAuxStruct */static BlastCoreAuxStruct* BlastCoreAuxStructFree(BlastCoreAuxStruct* aux_struct){ BlastExtendWordFree(aux_struct->ewp); BLAST_InitHitListFree(aux_struct->init_hitlist); Blast_HSPListFree(aux_struct->hsp_list); sfree(aux_struct->query_offsets); sfree(aux_struct->subject_offsets); sfree(aux_struct); return NULL;}/** Adjust HSP coordinates for out-of-frame gapped extension. * @param program One of blastx or tblastn [in] * @param init_hitlist List of hits after ungapped extension [in] * @param query_info Query information containing context offsets; * needed for blastx only [in] * @param subject_frame Frame of the subject sequence; tblastn only [in] * @param subject_length Length of the original nucleotide subject sequence; * tblastn only [in] * @param offset Shift in the subject sequence protein coordinates [in] */static void TranslateHSPsToDNAPCoord(Uint1 program, BlastInitHitList* init_hitlist, BlastQueryInfo* query_info, Int2 subject_frame, Int4 subject_length, Int4 offset){ BlastInitHSP* init_hsp; Int4 index, context, frame; Int4* context_offsets = query_info->context_offsets; for (index = 0; index < init_hitlist->total; ++index) { init_hsp = &init_hitlist->init_hsp_array[index]; if (program == blast_type_blastx) { context = BSearchInt4(init_hsp->q_off, context_offsets, query_info->last_context+1); frame = context % 3; init_hsp->q_off = (init_hsp->q_off - context_offsets[context]) * CODON_LENGTH + context_offsets[context-frame] + frame; init_hsp->ungapped_data->q_start = (init_hsp->ungapped_data->q_start - context_offsets[context]) * CODON_LENGTH + context_offsets[context-frame] + frame; } else { init_hsp->s_off += offset; init_hsp->ungapped_data->s_start += offset; if (subject_frame > 0) { init_hsp->s_off = (init_hsp->s_off * CODON_LENGTH) + subject_frame - 1; init_hsp->ungapped_data->s_start = (init_hsp->ungapped_data->s_start * CODON_LENGTH) + subject_frame - 1; } else { init_hsp->s_off = (init_hsp->s_off * CODON_LENGTH) + subject_length - subject_frame; init_hsp->ungapped_data->s_start = (init_hsp->ungapped_data->s_start * CODON_LENGTH) + subject_length - subject_frame; } } } return;}/** The core of the BLAST search: comparison between the (concatenated) * query against one subject sequence. Translation of the subject sequence * into 6 frames is done inside, if necessary. If subject sequence is * too long, it can be split into several chunks. */static Int2BLAST_SearchEngineCore(Uint1 program_number, BLAST_SequenceBlk* query, BlastQueryInfo* query_info, BLAST_SequenceBlk* subject, LookupTableWrap* lookup, BlastGapAlignStruct* gap_align, BlastScoringParameters* score_params, BlastInitialWordParameters* word_params, BlastExtensionParameters* ext_params, BlastHitSavingParameters* hit_params, const BlastDatabaseOptions* db_options, BlastDiagnostics* diagnostics, BlastCoreAuxStruct* aux_struct, BlastHSPList** hsp_list_out){ BlastInitHitList* init_hitlist = aux_struct->init_hitlist; BlastHSPList* hsp_list = aux_struct->hsp_list; Uint1* translation_buffer = NULL; Int4* frame_offsets = NULL; BlastHitSavingOptions* hit_options = hit_params->options; BlastScoringOptions* score_options = score_params->options; BlastHSPList* combined_hsp_list = NULL; Int2 status = 0; Int4 context, first_context, last_context; Int4 orig_length = subject->length; Uint1* orig_sequence = subject->sequence; Int4 **matrix; Int4 hsp_num_max; BlastUngappedStats* ungapped_stats = NULL; BlastGappedStats* gapped_stats = NULL; const Boolean k_translated_subject = (program_number == blast_type_tblastn || program_number == blast_type_tblastx || program_number == blast_type_rpstblastn); if (k_translated_subject) { first_context = 0; last_context = 5; if (score_options->is_ooframe) { BLAST_GetAllTranslations(orig_sequence, NCBI2NA_ENCODING, orig_length, db_options->gen_code_string, &translation_buffer, &frame_offsets, &subject->oof_sequence); subject->oof_sequence_allocated = TRUE; } else { BLAST_GetAllTranslations(orig_sequence, NCBI2NA_ENCODING, orig_length, db_options->gen_code_string, &translation_buffer, &frame_offsets, NULL); } } else if (program_number == blast_type_blastn) { first_context = 1; last_context = 1; } else { first_context = 0; last_context = 0; } *hsp_list_out = NULL; if (gap_align->positionBased) matrix = gap_align->sbp->posMatrix; else matrix = gap_align->sbp->matrix; hsp_num_max = (hit_options->hsp_num_max ? hit_options->hsp_num_max : INT4_MAX); if (diagnostics) { ungapped_stats = diagnostics->ungapped_stat; gapped_stats = diagnostics->gapped_stat; } /* Loop over frames of the subject sequence */ for (context=first_context; context<=last_context; context++) { Int4 chunk; /* loop variable below. */ Int4 num_chunks; /* loop variable below. */ Int4 offset = 0; /* Used as offset into subject sequence (if chunked) */ Int4 total_subject_length; /* Length of subject sequence used when split. */ if (k_translated_subject) { subject->frame = BLAST_ContextToFrame(blast_type_blastx, context); subject->sequence = translation_buffer + frame_offsets[context] + 1; subject->length = frame_offsets[context+1] - frame_offsets[context] - 1; } else { subject->frame = context; } /* Split subject sequence into chunks if it is too long */ num_chunks = (subject->length - DBSEQ_CHUNK_OVERLAP) / (MAX_DBSEQ_LEN - DBSEQ_CHUNK_OVERLAP) + 1; total_subject_length = subject->length; for (chunk = 0; chunk < num_chunks; ++chunk) { if (chunk > 0) { offset += subject->length - DBSEQ_CHUNK_OVERLAP; if (program_number == blast_type_blastn) { subject->sequence += (subject->length - DBSEQ_CHUNK_OVERLAP)/COMPRESSION_RATIO; } else { subject->sequence += (subject->length - DBSEQ_CHUNK_OVERLAP); } } subject->length = MIN(total_subject_length - offset, MAX_DBSEQ_LEN); BlastInitHitListReset(init_hitlist); aux_struct->WordFinder(subject, query, lookup, matrix, word_params, aux_struct->ewp, aux_struct->query_offsets, aux_struct->subject_offsets, GetOffsetArraySize(lookup), init_hitlist, ungapped_stats); if (init_hitlist->total == 0) continue; if (score_options->gapped_calculation) { Int4 prot_length = 0; if (score_options->is_ooframe) { /* Convert query offsets in all HSPs into the mixed-frame coordinates */ TranslateHSPsToDNAPCoord(program_number, init_hitlist, query_info, subject->frame, orig_length, offset); if (k_translated_subject) { prot_length = subject->length; subject->length = 2*orig_length + 1; } } /** NB: If queries are concatenated, HSP offsets must be adjusted * inside the following function call, so coordinates are * relative to the individual contexts (i.e. queries, strands or * frames). Contexts should also be filled in HSPs when they * are saved. */ aux_struct->GetGappedScore(program_number, query, query_info, subject, gap_align, score_params, ext_params, hit_params, init_hitlist, &hsp_list, gapped_stats); if (score_options->is_ooframe && k_translated_subject) subject->length = prot_length; } else { BLAST_GetUngappedHSPList(init_hitlist, query_info, subject, hit_options, &hsp_list); } if (hsp_list->hspcnt == 0) continue; /* The subject ordinal id is not yet filled in this HSP list */ hsp_list->oid = subject->oid; /* Assign frames in all HSPs. */ Blast_HSPListSetFrames(program_number, hsp_list, score_options->is_ooframe); Blast_HSPListAdjustOffsets(hsp_list, offset); /* Allow merging of HSPs either if traceback is already available, or if it is an ungapped search */ Blast_HSPListsMerge(hsp_list, &combined_hsp_list, hsp_num_max, offset, (Boolean) (hsp_list->traceback_done || !score_options->gapped_calculation)); } /* End loop on chunks of subject sequence */ Blast_HSPListAppend(combined_hsp_list, hsp_list_out, hsp_num_max); combined_hsp_list = Blast_HSPListFree(combined_hsp_list); } /* End loop on frames */ /* Restore the original contents of the subject block */ subject->length = orig_length; subject->sequence = orig_sequence; hsp_list = *hsp_list_out; if (hit_params->do_sum_stats == TRUE) { status = BLAST_LinkHsps(program_number, hsp_list, query_info, subject, gap_align->sbp, hit_params, score_options->gapped_calculation); } else if (hit_options->phi_align) { /* These e-values will not be accurate yet, since we don't know the number of pattern occurrencies in the database. That is arbitrarily set to 1 at this time. */ Blast_HSPListPHIGetEvalues(hsp_list, gap_align->sbp); } else { /* Calculate e-values for all HSPs. Skip this step for RPS blast, since calculating the E values requires precomputation that has not been done yet */ if (program_number != blast_type_rpsblast && program_number != blast_type_rpstblastn) status = Blast_HSPListGetEvalues(program_number, query_info, hsp_list, score_options->gapped_calculation, gap_align->sbp); } /* Discard HSPs that don't pass the e-value test */ status = Blast_HSPListReapByEvalue(hsp_list, hit_options); if (gapped_stats && hsp_list && hsp_list->hspcnt > 0) { ++gapped_stats->num_seqs_passed; gapped_stats->good_extensions += hsp_list->hspcnt; } if (translation_buffer) { sfree(translation_buffer); } if (frame_offsets) { sfree(frame_offsets); } return status;}static Int2 FillReturnCutoffsInfo(BlastRawCutoffs* return_cutoffs, BlastScoringParameters* score_params, BlastInitialWordParameters* word_params, BlastExtensionParameters* ext_params){ /* since the cutoff score here will be used for display putposes, strip out any internal scaling of the scores */ Int4 scale_factor = (Int4)score_params->scale_factor; if (!return_cutoffs) return -1; return_cutoffs->x_drop_ungapped = word_params->x_dropoff / scale_factor; return_cutoffs->x_drop_gap = ext_params->gap_x_dropoff / scale_factor; return_cutoffs->x_drop_gap_final = ext_params->gap_x_dropoff_final / scale_factor; return_cutoffs->gap_trigger = ext_params->gap_trigger / scale_factor; return 0;}/** Setup of the auxiliary BLAST structures; * also calculates internally used parameters from options. * @param program_number blastn, blastp, blastx, etc. [in] * @param seq_src Sequence source information, with callbacks to get * sequences, their lengths, etc. [in] * @param scoring_options options for scoring. [in] * @param eff_len_options used to calculate effective lengths. [in] * @param lookup_wrap Lookup table, already constructed. [in] * @param word_options options for initial word finding. [in] * @param ext_options options for gapped extension. [in] * @param hit_options options for saving hits. [in] * @param query The query sequence block [in] * @param query_info The query information block [in] * @param sbp Contains scoring information. [in] * @param gap_align Gapped alignment information and allocated memory [out] * @param score_params Parameters for scoring [out] * @param word_params Parameters for initial word processing [out] * @param ext_params Parameters for gapped extension [out] * @param hit_params Parameters for saving hits [out] * @param eff_len_params Parameters for calculating effective lengths [out] * @param aux_struct_ptr Placeholder joining various auxiliary memory * structures [out] */static Int2 BLAST_SetUpAuxStructures(Uint1 program_number, const BlastSeqSrc* seq_src, const BlastScoringOptions* scoring_options, const BlastEffectiveLengthsOptions* eff_len_options, LookupTableWrap* lookup_wrap, const BlastInitialWordOptions* word_options, const BlastExtensionOptions* ext_options, const BlastHitSavingOptions* hit_options, BLAST_SequenceBlk* query, BlastQueryInfo* query_info, BlastScoreBlk* sbp,
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -