📄 melfrequencyfilterbank.java
字号:
/* * Copyright 1999-2002 Carnegie Mellon University. * Portions Copyright 2002 Sun Microsystems, Inc. * Portions Copyright 2002 Mitsubishi Electric Research Laboratories. * All Rights Reserved. Use is subject to license terms. * * See the file "license.terms" for information on usage and * redistribution of this file, and for a DISCLAIMER OF ALL * WARRANTIES. * */package edu.cmu.sphinx.frontend.frequencywarp;import edu.cmu.sphinx.frontend.BaseDataProcessor;import edu.cmu.sphinx.frontend.Data;import edu.cmu.sphinx.frontend.DataProcessingException;import edu.cmu.sphinx.frontend.DoubleData;import edu.cmu.sphinx.util.props.PropertyException;import edu.cmu.sphinx.util.props.PropertySheet;import edu.cmu.sphinx.util.props.PropertyType;import edu.cmu.sphinx.util.props.Registry;/** * Filters an input power spectrum through a bank of number of mel-filters. The * output is an array of filtered values, typically called mel-spectrum, each * corresponding to the result of filtering the input spectrum through an * individual filter. Therefore, the length of the output array is equal to the * number of filters created. * <p> * The triangular mel-filters in the filter bank are placed in the frequency * axis so that each filter's center frequency follows the mel scale, in such a * way that the filter bank mimics the critical band, which represents * different perceptual effect at different frequency bands. Additionally, the * edges are placed so that they coincide with the center frequencies in * adjacent filters. Pictorially, the filter bank looks like: * <p> * <img src="doc-files/melfilterbank.jpg"> <br> * <center><b>Figure 1: A Mel-filter bank. </b> </center> * <p> * As you might notice in the above figure, the distance at the base from the * center to the left edge is different from the center to the right edge. * Since the center frequencies follow the mel-frequency scale, which is a * non-linear scale that models the non-linear human hearing behavior, the mel * filter bank corresponds to a warping of the frequency axis. As can be * inferred from the figure, filtering with the mel scale emphasizes the lower * frequencies. A common model for the relation between frequencies in mel and * linear scales is as follows: * <p> * <code>melFrequency = 2595 * log(1 + linearFrequency/700)</code> * <p> * The constants that define the filterbank are the number of filters, the * minimum frequency, and the maximum frequency. The minimum and maximum * frequencies determine the frequency range spanned by the filterbank. These * frequencies depend on the channel and the sampling frequency that you are * using. For telephone speech, since the telephone channel corresponds to a * bandpass filter with cutoff frequencies of around 300Hz and 3700Hz, using * limits wider than these would waste bandwidth. For clean speech, the minimum * frequency should be higher than about 100Hz, since there is no speech * information below it. Furthermore, by setting the minimum frequency above * 50/60Hz, we get rid of the hum resulting from the AC power, if present. * <p> * The maximum frequency has to be lower than the Nyquist frequency, that is, * half the sampling rate. Furthermore, there is not much information above * 6800Hz that can be used for improving separation between models. * Particularly for very noisy channels, maximum frequency of around 5000Hz may * help cut off the noise. * <p> * Typical values for the constants defining the filter bank are: <table * width="80%" border="1"> * <tr> * <td><b>Sample rate (Hz) </b></td> * <td><b>16000 </b></td> * <td><b>11025 </b></td> * <td><b>8000 </b></td> * </tr> * <tr> * <td>{@link #PROP_NUMBER_FILTERS numberFilters}</td> * <td>40</td> * <td>36</td> * <td>31</td> * </tr> * <tr> * <td>{@link #PROP_MIN_FREQ minimumFrequency}(Hz)</td> * <td>130</td> * <td>130</td> * <td>200</td> * </tr> * <tr> * <td>{@link #PROP_MAX_FREQ maximumFrequency}(Hz)</td> * <td>6800</td> * <td>5400</td> * <td>3500</td> * </tr> * </table> * <p> * Davis and Mermelstein showed that Mel-frequency cepstral coefficients * present robust characteristics that are good for speech recognition. For * details, see Davis and Mermelstein, <i>Comparison of Parametric * Representations for Monosyllable Word Recognition in Continuously Spoken * Sentences, IEEE Transactions on Acoustic, Speech and Signal Processing, 1980 * </i>. * * @see MelFilter */public class MelFrequencyFilterBank extends BaseDataProcessor { /** * The name of the Sphinx Property for the number of filters in the * filterbank. */ public static final String PROP_NUMBER_FILTERS = "numberFilters"; /** * The default value for PROP_NUMBER_FILTERS. */ public static final int PROP_NUMBER_FILTERS_DEFAULT = 40; /** * The name of the Sphinx Property for the minimum frequency covered by the * filterbank. */ public static final String PROP_MIN_FREQ = "minimumFrequency"; /** * The default value of PROP_MIN_FREQ. */ public static final double PROP_MIN_FREQ_DEFAULT = 130.0; /** * The name of the Sphinx Property for the maximum frequency covered by the * filterbank. */ public static final String PROP_MAX_FREQ = "maximumFrequency"; /** * The default value of PROP_MAX_FREQ. */ public static final double PROP_MAX_FREQ_DEFAULT = 6800.0; // ---------------------------------- // Configuration data // ---------------------------------- private int sampleRate; private int numberFftPoints; private int numberFilters; private double minFreq; private double maxFreq; private MelFilter[] filter; /* * (non-Javadoc) * * @see edu.cmu.sphinx.util.props.Configurable#register(java.lang.String, * edu.cmu.sphinx.util.props.Registry) */ public void register(String name, Registry registry) throws PropertyException { super.register(name, registry); registry.register(PROP_MIN_FREQ, PropertyType.DOUBLE); registry.register(PROP_MAX_FREQ, PropertyType.DOUBLE); registry.register(PROP_NUMBER_FILTERS, PropertyType.INT); } /* * (non-Javadoc) * * @see edu.cmu.sphinx.util.props.Configurable#newProperties(edu.cmu.sphinx.util.props.PropertySheet) */ public void newProperties(PropertySheet ps) throws PropertyException { super.newProperties(ps); minFreq = ps.getDouble(PROP_MIN_FREQ, PROP_MIN_FREQ_DEFAULT); maxFreq = ps.getDouble(PROP_MAX_FREQ, PROP_MAX_FREQ_DEFAULT); numberFilters = ps.getInt(PROP_NUMBER_FILTERS, PROP_NUMBER_FILTERS_DEFAULT); } /* * (non-Javadoc) * * @see edu.cmu.sphinx.frontend.DataProcessor#initialize(edu.cmu.sphinx.frontend.CommonConfig) */ public void initialize() { super.initialize(); } /** * Compute mel frequency from linear frequency. * * Since we don't have <code>log10()</code>, we have to compute it using * natural log: <b>log10(x) = ln(x) / ln(10) </b> * * @param inputFreq * the input frequency in linear scale * * @return the frequency in a mel scale * */ private double linToMelFreq(double inputFreq) { return (2595.0 * (Math.log(1.0 + inputFreq / 700.0) / Math.log(10.0))); } /** * Compute linear frequency from mel frequency. * * @param inputFreq * the input frequency in mel scale * * @return the frequency in a linear scale * */ private double melToLinFreq(double inputFreq) { return (700.0 * (Math.pow(10.0, (inputFreq / 2595.0)) - 1.0)); } /** * Sets the given frequency to the nearest frequency bin from the FFT. The * FFT can be thought of as a sampling of the actual spectrum of a signal. * We use this function to find the sampling point of the spectrum that is * closest to the given frequency. * * @param inFreq * the input frequency * @param stepFreq * the distance between frequency bins * * @return the closest frequency bin * * @throws IllegalArgumentException */ private double setToNearestFrequencyBin(double inFreq, double stepFreq) throws IllegalArgumentException { if (stepFreq == 0) { throw new IllegalArgumentException("stepFreq is zero"); } return stepFreq * Math.round(inFreq / stepFreq); } /** * Build a mel filterbank with the parameters given. Each filter will be * shaped as a triangle. The triangles overlap so that they cover the whole * frequency range requested. The edges of a given triangle will be by * default at the center of the neighboring triangles. * * @param numberFftPoints * number of points in the power spectrum * @param numberFilters * number of filters in the filterbank * @param minFreq * lowest frequency in the range of interest * @param maxFreq * highest frequency in the range of interest * * @throws IllegalArgumentException */ private void buildFilterbank(int numberFftPoints, int numberFilters, double minFreq, double maxFreq) throws IllegalArgumentException { double minFreqMel; double maxFreqMel; double deltaFreqMel; double[] leftEdge = new double[numberFilters]; double[] centerFreq = new double[numberFilters]; double[] rightEdge = new double[numberFilters]; double nextEdgeMel; double nextEdge; double initialFreqBin; double deltaFreq; this.filter = new MelFilter[numberFilters]; /** * In fact, the ratio should be between <code>sampleRate / * 2</code> * and <code>numberFftPoints / 2</code> since the number of points in * the power spectrum is half of the number of FFT points - the other * half would be symmetrical for a real sequence -, and these points * cover up to the Nyquist frequency, which is half of the sampling * rate. The two "divide by 2" get canceled out. */ if (numberFftPoints == 0) { throw new IllegalArgumentException("Number of FFT points is zero"); } deltaFreq = (double) sampleRate / numberFftPoints; /** * Initialize edges and center freq. These variables will be updated so * that the center frequency of a filter is the right edge of the * filter to its left, and the left edge of the filter to its right. */ if (numberFilters < 1) { throw new IllegalArgumentException("Number of filters illegal: " + numberFilters); } minFreqMel = linToMelFreq(minFreq); maxFreqMel = linToMelFreq(maxFreq); deltaFreqMel = (maxFreqMel - minFreqMel) / (numberFilters + 1); leftEdge[0] = setToNearestFrequencyBin(minFreq, deltaFreq); nextEdgeMel = minFreqMel; for (int i = 0; i < numberFilters; i++) { nextEdgeMel += deltaFreqMel; nextEdge = melToLinFreq(nextEdgeMel); centerFreq[i] = setToNearestFrequencyBin(nextEdge, deltaFreq); if (i > 0) { rightEdge[i - 1] = centerFreq[i]; } if (i < numberFilters - 1) { leftEdge[i + 1] = centerFreq[i]; } } nextEdgeMel = nextEdgeMel + deltaFreqMel; nextEdge = melToLinFreq(nextEdgeMel); rightEdge[numberFilters - 1] = setToNearestFrequencyBin(nextEdge, deltaFreq); for (int i = 0; i < numberFilters; i++) { initialFreqBin = setToNearestFrequencyBin(leftEdge[i], deltaFreq); if (initialFreqBin < leftEdge[i]) { initialFreqBin += deltaFreq; } this.filter[i] = new MelFilter(leftEdge[i], centerFreq[i], rightEdge[i], initialFreqBin, deltaFreq); } } /** * Process data, creating the power spectrum from an input audio frame. * * @param input * input power spectrum * * @return power spectrum * * @throws java.lang.IllegalArgumentException */ private DoubleData process(DoubleData input) throws IllegalArgumentException { double[] in = input.getValues(); if (filter == null || sampleRate != input.getSampleRate()) { numberFftPoints = (in.length - 1) * 2; sampleRate = input.getSampleRate(); buildFilterbank(numberFftPoints, numberFilters, minFreq, maxFreq); } else if (in.length != ((numberFftPoints >> 1) + 1)) { throw new IllegalArgumentException( "Window size is incorrect: in.length == " + in.length + ", numberFftPoints == " + ((numberFftPoints >> 1) + 1)); } double[] output = new double[numberFilters]; /** * Filter input power spectrum */ for (int i = 0; i < numberFilters; i++) { output[i] = filter[i].filterOutput(in); } DoubleData outputMelSpectrum = new DoubleData(output, sampleRate, input.getCollectTime(), input.getFirstSampleNumber()); return outputMelSpectrum; } /** * Reads the next Data object, which is the power spectrum of an audio * input frame. Signals are returned unmodified. * * @return the next available Data or Signal object, or returns null if no * Data is available * * @throws DataProcessingException * if there is a data processing error */ public Data getData() throws DataProcessingException { Data input = getPredecessor().getData(); getTimer().start(); if (input != null) { if (input instanceof DoubleData) { input = process((DoubleData) input); } } getTimer().stop(); return input; }}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -