
Control System for Speech Information Recognition (page 10 of 10)

WaveOutBuffer Buf = new WaveOutBuffer(m_WaveOut, bufferSize);

Prev.NextBuffer = Buf;

Prev = Buf;

}

}

finally

{

Prev.NextBuffer = m_Buffers;

}

}

}

private void FreeBuffers()

{

m_CurrentBuffer = null;

if (m_Buffers != null)

{

WaveOutBuffer First = m_Buffers;

m_Buffers = null;

WaveOutBuffer Current = First;

do

{

WaveOutBuffer Next = Current.NextBuffer;

Current.Dispose();

Current = Next;

} while(Current != First);

}

}

private void Advance()

{
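
// Step to the next buffer in the circular list (or to the first one) and wait for it before refilling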

m_CurrentBuffer = m_CurrentBuffer == null ? m_Buffers : m_CurrentBuffer.NextBuffer;

m_CurrentBuffer.WaitFor();

}

private void WaitForAllBuffers()

{

WaveOutBuffer Buf = m_Buffers;

while (Buf.NextBuffer != m_Buffers)

{

Buf.WaitFor();

Buf = Buf.NextBuffer;

}

}

}

}

1.3) SignalGenerator.cs

// Speech recognition

// signal generator => to generate various signals like sawtooth…

using System;

using System.Collections.Generic;

using System.Text;

namespace SoundViewer

{

class SignalGenerator

{

private string _waveForm = "Sine";

private double _amplitude = 128.0;

private double _samplingRate = 44100;

private double _frequency = 5000.0;

private double _dcLevel = 0.0;

private double _noise = 0.0;

private int _samples = 16384;

private bool _addDCLevel = false;

private bool _addNoise = false;

public SignalGenerator()

{

}

public void SetWaveform(string waveForm)

{

_waveForm = waveForm;

}

public String GetWaveform()

{

return _waveForm;

}

public void SetAmplitude(double amplitude)

{

_amplitude = amplitude;

}

public double GetAmplitude()

{

return _amplitude;

}

public void SetFrequency(double frequency)

{

_frequency = frequency;

}

public double GetFrequency()

{

return _frequency;

}

public void SetSamplingRate(double rate)

{

_samplingRate = rate;

}

public double GetSamplingRate()

{

return _samplingRate;

}

public void SetSamples(int samples)

{

_samples = samples;

}

public int GetSamples()

{

return _samples;

}

public void SetDCLevel(double dc)

{

_dcLevel = dc;

}

public double GetDCLevel()

{

return _dcLevel;

}

public void SetNoise(double noise)

{

_noise = noise;

}

public double GetNoise()

{

return _noise;

}

public void SetDCLevelState(bool dcstate)

{

_addDCLevel = dcstate;

}

public bool IsDCLevel()

{

return _addDCLevel;

}

public void SetNoiseState(bool noisestate)

{

_addNoise = noisestate;

}

public bool IsNoise()

{

return _addNoise;

}

public double[] GenerateSignal()

{

double[] values = new double[_samples];

if (_waveForm.Equals("Sine"))

{
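
// theta is the phase increment per sample, 2*pi*frequency/samplingRate radians; sample i then has phase i*theta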

double theta = 2.0 * Math.PI * _frequency / _samplingRate;

for (int i = 0; i < _samples; i++)

{

values[i] = _amplitude * Math.Sin(i * theta);

}

}

if (_waveForm.Equals("Cosine"))

{

double theta = 2.0 * Math.PI * _frequency / _samplingRate;

for (int i = 0; i < _samples; i++)

values[i] = _amplitude * Math.Cos(i * theta);

}

if (_waveForm.Equals("Square"))

{

double p = 2.0 * _frequency / _samplingRate;
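
// Math.Round(i * p) advances by 1 every half period (about samplingRate / (2 * frequency) samples), so its parity toggles the output sign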

for (int i = 0; i < _samples; i++)

values[i] = Math.Round(i * p) % 2 == 0 ? _amplitude : -_amplitude;

}

if (_waveForm.Equals("Triangular"))

{

double p = 2.0 * _frequency / _samplingRate;

for (int i = 0; i < _samples; i++)

{

int ip = (int)Math.Round(i * p);

values[i] = 2.0 * _amplitude * (1 - 2 * (ip % 2)) * (i * p - ip);

}

}

if (_waveForm.Equals("Sawtooth"))

{

for (int i = 0; i < _samples; i++)

{

double q = i * _frequency / _samplingRate;

values[i] = 2.0 * _amplitude * (q - Math.Round(q));

}

}

if (_addDCLevel)

{

for (int i = 0; i < _samples; i++)

values[i] += _dcLevel;

}

if (_addNoise)

{

Random r = new Random();

for (int i = 0; i < _samples; i++)

values[i] += _noise * r.NextDouble(); // NextDouble() keeps the added noise within [0, _noise); Next() would return a huge integer

}

return values;

}

}

}
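
For reference, a minimal usage sketch of the class above (not part of the original project; the variable name gen is illustrative):

SignalGenerator gen = new SignalGenerator();
gen.SetWaveform("Square");
gen.SetSamplingRate(44100);
gen.SetFrequency(1000.0);
gen.SetSamples(4096);
gen.SetNoiseState(true);
gen.SetNoise(5.0);
double[] samples = gen.GenerateSignal(); // 4096 values in roughly [-128, 128] plus the added noise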

1.4) AudioFrame.cs

// Speech recognition

// AudioFrame => channel splitting, FFT and rendering of an audio frame

using System;

using System.Drawing;

using System.Windows.Forms;

namespace SoundViewer

{

class AudioFrame

{

private Bitmap _canvasTimeDomain;

private Bitmap _canvasFrequencyDomain;

private double[] _waveLeft;

private double[] _waveRight;

private double[] _fftLeft;

private double[] _fftRight;

private SignalGenerator _signalGenerator;

private bool _isTest = false;

public AudioFrame(bool isTest)

{

_isTest = isTest;

}

/// <summary>

/// Process 16 bit sample

/// </summary>

/// <param name="wave"></param>

public void Process(ref byte[] wave)

{

_waveLeft = new double[wave.Length / 4];

_waveRight = new double[wave.Length / 4];

if (_isTest == false)

{

// Split out channels from sample
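
// Each 4-byte frame of the 16-bit stereo buffer is one 16-bit left sample followed by one 16-bit right sample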

int h = 0;

for (int i = 0; i < wave.Length; i += 4)

{

_waveLeft[h] = (double)BitConverter.ToInt16(wave, i);

_waveRight[h] = (double)BitConverter.ToInt16(wave, i + 2);

h++;

}

}

else

{

// Generate artificial sample for testing

_signalGenerator = new SignalGenerator();

_signalGenerator.SetWaveform("Sine");

_signalGenerator.SetSamplingRate(44100);

_signalGenerator.SetSamples(16384);

_signalGenerator.SetFrequency(5000);

_waveLeft = _signalGenerator.GenerateSignal();

_waveRight = _signalGenerator.GenerateSignal();

}

// Generate frequency domain data in decibels

_fftLeft = FourierTransform.FFTDb(ref _waveLeft);

_fftRight = FourierTransform.FFTDb(ref _waveRight);

}

/// <summary>

/// Render time domain to PictureBox

/// </summary>

/// <param name="pictureBox"></param>

public void RenderTimeDomain(ref PictureBox pictureBox)

{

// Set up for drawing

_canvasTimeDomain = new Bitmap(pictureBox.Width, pictureBox.Height);

Graphics offScreenDC = Graphics.FromImage(_canvasTimeDomain);

SolidBrush brush = new System.Drawing.SolidBrush(Color.FromArgb(0, 0, 0));

Pen pen = new System.Drawing.Pen(Color.WhiteSmoke);

// Determine channel boundaries

int width = _canvasTimeDomain.Width;

int center = _canvasTimeDomain.Height / 2;

int height = _canvasTimeDomain.Height;

offScreenDC.DrawLine(pen, 0, center, width, center);

int leftLeft = 0;

int leftTop = 0;

int leftRight = width;

int leftBottom = center - 1;

int rightLeft = 0;

int rightTop = center + 1;

int rightRight = width;

int rightBottom = height;

// Draw left channel

double yCenterLeft = (leftBottom - leftTop) / 2;

double yScaleLeft = 0.5 * (leftBottom - leftTop) / 32768; // a 16 bit sample has values from -32768 to 32767

int xPrevLeft = 0, yPrevLeft = 0;

for (int xAxis = leftLeft; xAxis < leftRight; xAxis++)

{

int yAxis = (int)(yCenterLeft + (_waveLeft[_waveLeft.Length / (leftRight - leftLeft) * xAxis] * yScaleLeft));

if (xAxis == 0)

{

xPrevLeft = 0;

yPrevLeft = yAxis;

}

else

{

pen.Color = Color.LimeGreen;

offScreenDC.DrawLine(pen, xPrevLeft, yPrevLeft, xAxis, yAxis);

xPrevLeft = xAxis;

yPrevLeft = yAxis;

}

}

// Draw right channel

int yCenterRight = rightTop + ((rightBottom - rightTop) / 2);

double yScaleRight = 0.5 * (rightBottom - rightTop) / 32768; // a 16 bit sample has values from -32768 to 32767

int xPrevRight = 0, yPrevRight = 0;

for (int xAxis = rightLeft; xAxis < rightRight; xAxis++)

{

int yAxis = (int)(yCenterRight + (_waveRight[_waveRight.Length / (rightRight - rightLeft) * xAxis] * yScaleRight));

if (xAxis == 0)

{

xPrevRight = 0;

yPrevRight = yAxis;

}

else

{

pen.Color = Color.LimeGreen;

offScreenDC.DrawLine(pen, xPrevRight, yPrevRight, xAxis, yAxis);

xPrevRight = xAxis;

yPrevRight = yAxis;

}

}

// Clean up

pictureBox.Image = _canvasTimeDomain;

offScreenDC.Dispose();

}

/// <summary>

/// Render frequency domain to PictureBox

/// </summary>

/// <param name="pictureBox"></param>

public void RenderFrequencyDomain(ref PictureBox pictureBox)

{

// Set up for drawing

_canvasFrequencyDomain = new Bitmap(pictureBox.Width, pictureBox.Height);

Graphics offScreenDC = Graphics.FromImage(_canvasFrequencyDomain);

SolidBrush brush = new System.Drawing.SolidBrush(Color.FromArgb(0, 0, 0));

Pen pen = new System.Drawing.Pen(Color.WhiteSmoke);

// Determine channel boundaries

int width = _canvasFrequencyDomain.Width;

int center = _canvasFrequencyDomain.Height / 2;

int height = _canvasFrequencyDomain.Height;

offScreenDC.DrawLine(pen, 0, center, width, center);

int leftLeft = 0;

int leftTop = 0;

int leftRight = width;

int leftBottom = center - 1;

int rightLeft = 0;

int rightTop = center + 1;

int rightRight = width;

int rightBottom = height;

// Draw left channel

for (int xAxis = leftLeft; xAxis < leftRight; xAxis++)

{

double amplitude = (int)_fftLeft[(int)(((double)(_fftLeft.Length) / (double)(width)) * xAxis)];

if (amplitude < 0) // Drop negative values

amplitude = 0;

int yAxis = (int)(leftTop + ((leftBottom - leftTop) * amplitude) / 100); // Arbitrary factor

pen.Color = Color.FromArgb(120, 120, (int)amplitude % 255);

offScreenDC.DrawLine(pen, xAxis, leftTop, xAxis, yAxis);

}

// Draw right channel

for (int xAxis = rightLeft; xAxis < rightRight; xAxis++)

{

double amplitude = (int)_fftRight[(int)(((double)(_fftRight.Length) / (double)(width)) * xAxis)];

if (amplitude < 0)

amplitude = 0;

int yAxis = (int)(rightBottom - ((rightBottom - rightTop) * amplitude) / 100);

pen.Color = Color.FromArgb(120, 120, (int)amplitude % 255);

offScreenDC.DrawLine(pen, xAxis, rightBottom, xAxis, yAxis);

}

// Clean up

pictureBox.Image = _canvasFrequencyDomain;

offScreenDC.Dispose();

}

private unsafe void WaveIn(short* buf, int len) // pointer parameter requires unsafe code to be enabled for the project

{

// recognition of the captured buffer would be performed here (stub)

}

}

}
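
A rough usage sketch of AudioFrame (assuming a WinForms form with PictureBox fields pictureBoxTime and pictureBoxFreq, which are hypothetical names, and the project's FourierTransform class referenced above):

byte[] wave = new byte[16384 * 4];               // 16-bit stereo PCM captured elsewhere
AudioFrame frame = new AudioFrame(true);         // true: use the built-in test signal instead of wave
frame.Process(ref wave);                         // split channels and compute the FFT magnitudes in dB
frame.RenderTimeDomain(ref pictureBoxTime);      // waveform display
frame.RenderFrequencyDomain(ref pictureBoxFreq); // spectrum display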

2. Program listing – Speech Recognition (Matlab)

2.1) CMN.m

function NormMatrix = CMN(Matrix)

[r,c]=size(Matrix);

NormMatrix=zeros(r,c);

for i=1:c

MatMean=mean(Matrix(:,i)); %Derives mean for each column i in utterance

NormMatrix(:,i)=Matrix(:,i)-MatMean; %Subtracts mean from each element in column i

end
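
CMN.m implements cepstral mean normalization: for an utterance of r frames, every coefficient column i has its per-utterance mean subtracted. In LaTeX notation, with C the r-by-c matrix of MFCCs:

\hat{C}(j,i) = C(j,i) - \frac{1}{r} \sum_{t=1}^{r} C(t,i), \qquad j = 1, \dots, r

This suppresses stationary convolutional (channel and microphone) effects in the cepstral domain; it is what the "Removes convolutional noise" comment in Recognition.m refers to.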

2.2) Recognition.m

clear all;

close all;

ncoeff = 13; %Required number of mfcc coefficients

N = 20; %Number of words in vocabulary

k = 3; %Number of nearest neighbors to choose

fs=16000; %Sampling rate

duration1 = 0.1; %Initial silence duration in seconds

duration2 = 2; %Recording duration in seconds

G=2; %vary this factor to compensate for amplitude variations

NSpeakers = 5; %Number of training speakers

fprintf('Press any key to start %g seconds of speech recording...', duration2);

pause;

silence = wavrecord(duration1*fs, fs);

fprintf('Recording speech...');

speechIn = wavrecord(duration2*fs, fs); % duration*fs is the total number of sample points

fprintf('Finished recording.\n');

fprintf('System is trying to recognize what you have spoken...\n');

speechIn1 = [silence;speechIn]; %pads with 100 ms of leading silence

speechIn2 = speechIn1.*G;

speechIn3 = speechIn2 - mean(speechIn2); %DC offset elimination

speechIn = nreduce(speechIn3,fs); %Applies spectral subtraction

rMatrix1 = mfccf(ncoeff,speechIn,fs); %Compute test feature vector

rMatrix = CMN(rMatrix1); %Removes convolutional noise

Sco = DTWScores(rMatrix,N); %computes all DTW scores

[SortedScores,EIndex] = sort(Sco); %Sort scores increasing

K_Vector = EIndex(1:k); %Gets k lowest scores

Neighbors = zeros(1,k); %will hold the word indices of the k nearest neighbors

for t = 1:k

u = K_Vector(t);
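
%Fold the template index u (which runs over all N*NSpeakers stored templates,
%assuming the scores are grouped speaker by speaker) down to a word index in 1..N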

for r = 1:NSpeakers-1

if u <= (N)

break

else u = u - (N);

end

end

Neighbors(t) = u;

end

%Apply k-Nearest Neighbor rule

Nbr = Neighbors

%sortk = sort(Nbr);

[Modal,Freq] = mode(Nbr); %most frequent value and how many times it occurs

Word = strvcat('One','Two','Three','Four','Five','Six','Seven','Eight','Nine','Ten','Yes','No','Hello','Open','Close','Start','Stop','Dial','On','Off');

if mean(abs(speechIn)) < 0.01

fprintf('No microphone connected or you have not said anything.\n');

elseif ((k/Freq) > 2) %if no majority

fprintf('The word you have said could not be properly recognised.\n');

else

fprintf('You have just said %s.\n',Word(Modal,:)); %Prints recognized word

end

2.3) setTemplates.m

ncoeff=13; %Required number of mfcc coefficients

fMatrix1 = cell(1,20);

fMatrix2 = cell(1,20);

fMatrix3 = cell(1,20);

fMatrix4 = cell(1,20);

fMatrix5 = cell(1,20);

for j = 1:20

q = ['C:\SpeechData\Amir\5_' num2str(j) '.wav'];

[speechIn1,FS1] = wavread(q);

speechIn1 = myVAD(speechIn1); %Speech endpoint trimming

fMatrix1(1,j) = {mfccf(ncoeff,speechIn1,FS1)}; %MFCC coefficients are

%computed here

end

for k = 1:20

q = ['C:\SpeechData\Ayo\5_' num2str(k) '.wav'];

[speechIn2,FS2] = wavread(q);

speechIn2 = myVAD(speechIn2);

fMatrix2(1,k) = {mfccf(ncoeff,speechIn2,FS2)};

end

for l = 1:20

q = ['C:\SpeechData\Sameh\5_' num2str(l) '.wav'];

[speechIn3,FS3] = wavread(q);

speechIn3 = myVAD(speechIn3);

fMatrix3(1,l) = {mfccf(ncoeff,speechIn3,FS3)};

end

for m = 1:20

q = ['C:\SpeechData\Jim\5_' num2str(m) '.wav'];

[speechIn4,FS4] = wavread(q);

speechIn4 = myVAD(speechIn4);

fMatrix4(1,m) = {mfccf(ncoeff,speechIn4,FS4)};

end

for n = 1:20

q = ['C:\SpeechData\Tope\5_' num2str(n) '.wav'];

[speechIn5,FS5] = wavread(q);

speechIn5 = myVAD(speechIn5);

fMatrix5(1,n) = {mfccf(ncoeff,speechIn5,FS5)};

end

%Converts the cells containing all matrices to structures and saves the

%structures in matlab .mat files in the working directory.

fields = {'One','Two','Three','Four','Five','Six','Seven','Eight','Nine','Ten','Yes','No','Hello','Open','Close','Start','Stop','Dial','On','Off'};

s1 = cell2struct(fMatrix1, fields, 2);

save Vectors1.mat -struct s1;

s2 = cell2struct(fMatrix2, fields, 2);

save Vectors2.mat -struct s2;

s3 = cell2struct(fMatrix3, fields, 2);

save Vectors3.mat -struct s3;

s4 = cell2struct(fMatrix4, fields, 2);

save Vectors4.mat -struct s4;

s5 = cell2struct(fMatrix5, fields, 2);

save Vectors5.mat -struct s5;
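
%Each Vectors<n>.mat now stores one variable per vocabulary word ('One', 'Two', ..., 'Off');
%load('Vectors1.mat') restores them, so the DTW scoring stage can read the templates back.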