How do I implement speech recognition in my skeleton tracking program?

I am working with the Kinect v1. I wrote some code that calculates the angles between the arms and the body. Now I want to add speech recognition, but so far I have not managed to get it working.

My goal: give a voice command to the Kinect. The command triggers a function that does some work; what exactly it does is irrelevant for now. This is my code:

namespace Kinect_Robot
    using System.IO;
    using System.IO.Ports;
    using System.Threading;
    using System;
    using System.Windows;
    using System.Windows.Forms;
    using System.Windows.Media;
    using Microsoft.Kinect;
    using Microsoft.Kinect.Toolkit.Fusion;
    using System.Windows.Media.Media3D;

    using Microsoft.Speech.AudioFormat;
    using Microsoft.Speech.Recognition;
    using System.Windows.Documents;
    using System.Text;
    using System.ComponentModel;
    using System.Collections.Generic;

    /// Computes the elbow and shoulder angles of a skeleton's arms and packs them into bytes.
    public class Angles
    {
        /// Returns the angle between two vectors in degrees, in the range [0, 180].
        /// The vectors do not have to be normalized.
        /// If either vector has zero (or non-finite) length the angle is undefined and 0 is returned.
        public double AngleBetweenTwoVectors(Vector3D vectorA, Vector3D vectorB)
        {
            double lengthProduct = vectorA.Length * vectorB.Length;

            // Written as a negated comparison so that NaN lengths are rejected together with zero.
            if (!(lengthProduct > 0.0))
            {
                return 0.0;
            }

            // Acos only yields the angle when it is given the cosine, i.e. the dot product
            // divided by both vector lengths. The raw dot product is not a cosine.
            double cosine = Vector3D.DotProduct(vectorA, vectorB) / lengthProduct;

            // Rounding can push the value slightly outside [-1, 1], where Acos returns NaN.
            cosine = Math.Max(-1.0, Math.Min(1.0, cosine));

            return Math.Acos(cosine) / Math.PI * 180;
        }

        /// Calculates the arm angles of the given skeleton.
        /// Returns four bytes in this order: right elbow angle, 180 minus right shoulder angle,
        /// 180 minus left elbow angle, left shoulder angle (all in degrees).
        /// The shoulder angles are measured between the upper arm and the vertical axis (0, 1, 0).
        public byte[] GetVector(Skeleton skeleton)
        {
            Vector3D rightShoulder = ToVector3D(skeleton, JointType.ShoulderRight);
            Vector3D leftShoulder = ToVector3D(skeleton, JointType.ShoulderLeft);
            Vector3D rightElbow = ToVector3D(skeleton, JointType.ElbowRight);
            Vector3D leftElbow = ToVector3D(skeleton, JointType.ElbowLeft);
            Vector3D rightWrist = ToVector3D(skeleton, JointType.WristRight);
            Vector3D leftWrist = ToVector3D(skeleton, JointType.WristLeft);
            Vector3D upVector = new Vector3D(0.0, 1.0, 0.0);

            double angleRightElbow = AngleBetweenTwoVectors(rightElbow - rightShoulder, rightElbow - rightWrist);
            double angleRightShoulder = AngleBetweenTwoVectors(upVector, rightShoulder - rightElbow);
            double angleLeftElbow = AngleBetweenTwoVectors(leftElbow - leftShoulder, leftElbow - leftWrist);
            double angleLeftShoulder = AngleBetweenTwoVectors(upVector, leftShoulder - leftElbow);

            // Every angle is within [0, 180], so each value (and 180 minus it) fits into a byte.
            return new byte[]
            {
                Convert.ToByte(angleRightElbow),
                Convert.ToByte(180 - angleRightShoulder),
                Convert.ToByte(180 - angleLeftElbow),
                Convert.ToByte(angleLeftShoulder)
            };
        }

        /// Copies the position of one skeleton joint into a Vector3D.
        private static Vector3D ToVector3D(Skeleton skeleton, JointType jointType)
        {
            SkeletonPoint position = skeleton.Joints[jointType].Position;
            return new Vector3D(position.X, position.Y, position.Z);
        }
    }

    /// Interaction logic for MainWindow.xaml
    public partial class  MainWindow : Window

        /// Serial port used to talk to the Arduino; created in CheckBoxConnectArduino
        /// and written to from SensorSkeletonFrameReady.
        SerialPort ArduinoPort;

        /// Set in CheckBoxConnectArduino; SensorSkeletonFrameReady only writes to the
        /// serial port while this flag is true.
        public Boolean PortOpen = false;

        /// Width of output drawing
        private const float RenderWidth = 640.0f;

        /// Height of our output drawing
        private const float RenderHeight = 480.0f;

        /// Thickness of drawn joint lines
        private const double JointThickness = 3;

        /// Thickness of body center ellipse
        private const double BodyCenterThickness = 10;

        /// Thickness of clip edge rectangles
        private const double ClipBoundsThickness = 10;

        /// Brush used to draw skeleton center point
        private readonly Brush centerPointBrush = Brushes.Blue;

        /// Brush used for drawing joints that are currently tracked
        private readonly Brush trackedJointBrush = new SolidColorBrush(Color.FromArgb(255, 68, 192, 68));

        /// Brush used for drawing joints that are currently inferred
        private readonly Brush inferredJointBrush = Brushes.Yellow;

        /// Pen used for drawing bones that are currently tracked
        private readonly Pen trackedBonePen = new Pen(Brushes.Green, 6);

        /// Pen used for drawing bones that are currently inferred
        private readonly Pen inferredBonePen = new Pen(Brushes.Gray, 1);

        /// Active Kinect sensor
        private KinectSensor sensor;

        /// Speech recognition engine; created in WindowLoaded when a Kinect recognizer is found,
        /// otherwise left null.
        private SpeechRecognitionEngine speechEngine;

        /// Drawing group for skeleton rendering output
        private DrawingGroup drawingGroup;

        /// Drawing image that we will display
        private DrawingImage imageSource;

        /// Initializes a new instance of the MainWindow class.
        // NOTE(review): the constructor body is not visible in this listing.
        public MainWindow()

        /// Searches the installed speech recognizers for one whose additional info marks it
        /// as a Kinect recognizer ("Kinect" = "True") and whose culture is en-US.
        /// Returns the first match, or null when no such recognizer is installed.
        private static RecognizerInfo GetKinectRecognizer()
        {
            foreach (RecognizerInfo candidate in SpeechRecognitionEngine.InstalledRecognizers())
            {
                string kinectFlag;
                candidate.AdditionalInfo.TryGetValue("Kinect", out kinectFlag);

                // Skip recognizers that are not flagged for Kinect before looking at the culture.
                if (!"True".Equals(kinectFlag, StringComparison.OrdinalIgnoreCase))
                {
                    continue;
                }

                if ("en-US".Equals(candidate.Culture.Name, StringComparison.OrdinalIgnoreCase))
                {
                    return candidate;
                }
            }

            return null;
        }

        /// Draws an indicator bar along every edge of the render area at which the
        /// skeleton is reported as clipped.
        /// skeleton: skeleton whose ClippedEdges flags are inspected.
        /// drawingContext: drawing context the indicator rectangles are drawn to.
        private static void RenderClippedEdges(Skeleton skeleton, DrawingContext drawingContext)
        {
            // NOTE(review): the listing only contained the Rect arguments; the red fill and the
            // null outline pen are assumed (they match the Kinect SDK skeleton sample) - confirm.
            if (skeleton.ClippedEdges.HasFlag(FrameEdges.Bottom))
            {
                drawingContext.DrawRectangle(
                    Brushes.Red,
                    null,
                    new Rect(0, RenderHeight - ClipBoundsThickness, RenderWidth, ClipBoundsThickness));
            }

            if (skeleton.ClippedEdges.HasFlag(FrameEdges.Top))
            {
                drawingContext.DrawRectangle(
                    Brushes.Red,
                    null,
                    new Rect(0, 0, RenderWidth, ClipBoundsThickness));
            }

            if (skeleton.ClippedEdges.HasFlag(FrameEdges.Left))
            {
                drawingContext.DrawRectangle(
                    Brushes.Red,
                    null,
                    new Rect(0, 0, ClipBoundsThickness, RenderHeight));
            }

            if (skeleton.ClippedEdges.HasFlag(FrameEdges.Right))
            {
                drawingContext.DrawRectangle(
                    Brushes.Red,
                    null,
                    new Rect(RenderWidth - ClipBoundsThickness, 0, ClipBoundsThickness, RenderHeight));
            }
        }

        /// Handles the window's Loaded event: creates the drawing surface, selects a connected
        /// Kinect sensor, subscribes to its skeleton frames and sets up speech recognition.
        /// NOTE(review): the author reports that recognition only worked after the order was changed
        /// to: start the sensor, start the audio stream, then subscribe SkeletonFrameReady.
        private void WindowLoaded(object sender, RoutedEventArgs e)

            // Create the drawing group we'll use for drawing
            this.drawingGroup = new DrawingGroup();

            // Create an image source that we can use in our image control
            this.imageSource = new DrawingImage(this.drawingGroup);

            // Display the drawing using our image control
            Image.Source = this.imageSource;

            // Look through all sensors and start the first connected one.
            // This requires that a Kinect is connected at the time of app startup.
            // To make your app robust against plug/unplug,
            // it is recommended to use KinectSensorChooser provided in Microsoft.Kinect.Toolkit (See components in Toolkit Browser).
            // NOTE(review): as shown, every connected sensor overwrites the field, so the last one
            // wins; a break after the assignment may be missing from this listing.
            foreach (var potentialSensor in KinectSensor.KinectSensors)
                if (potentialSensor.Status == KinectStatus.Connected)
                    this.sensor = potentialSensor;

            if (null != this.sensor)
                // Turn on the skeleton stream to receive skeleton frames
                // NOTE(review): no statement follows this comment in the listing - presumably the
                // call that enables the skeleton stream; confirm.

                // Add an event handler to be called whenever there is new skeleton frame data
                this.sensor.SkeletonFrameReady += this.SensorSkeletonFrameReady;

                // Start the sensor!
                // NOTE(review): the try block and the start call that this catch belongs to are not visible here.
                catch (IOException)
                    // Treat a sensor that cannot be started as "no sensor".
                    this.sensor = null;

            if (null == this.sensor)
                this.statusBarText.Text = Properties.Resources.NoKinectReady;

            RecognizerInfo ri = GetKinectRecognizer();

            if (null != ri)
                this.speechEngine = new SpeechRecognitionEngine(ri.Id);
                // Build the grammar from the SpeechGrammar resource text.
                // NOTE(review): nothing visible here loads g into the engine - confirm a LoadGrammar call exists.
                using (var memoryStream = new MemoryStream(Encoding.ASCII.GetBytes(Properties.Resources.SpeechGrammar)))
                    var g = new Grammar(memoryStream);

                speechEngine.SpeechRecognized += SpeechRecognized;
                speechEngine.SpeechRecognitionRejected += SpeechRejected;
                // NOTE(review): the next line is only the argument list of a call whose beginning is not
                // visible - presumably the engine's audio input setup. It starts the sensor's audio
                // source and describes the stream as 16 kHz, 16-bit, mono PCM. It dereferences sensor,
                // which is null when no Kinect was found above.
                    sensor.AudioSource.Start(), new SpeechAudioFormatInfo(EncodingFormat.Pcm, 16000, 16, 1, 32000, 2, null));
                // NOTE(review): as shown this message is set even when a recognizer was found;
                // presumably it belongs to an else branch that is missing from the listing.
                this.statusBarText.Text = Properties.Resources.NoSpeechRecognizer;

        /// Handles the window's Closing event: drops the sensor reference and detaches the
        /// speech recognition event handlers.
        private void WindowClosing(object sender, System.ComponentModel.CancelEventArgs e)

            if (null != this.sensor)
                // NOTE(review): the try block this catch belongs to is not visible in the listing -
                // presumably it stops the audio source and the sensor. The catch silently ignores
                // a NullReferenceException.
                catch (NullReferenceException) { };

                this.sensor = null;
            if (null != this.speechEngine)
                // Unsubscribe the handlers that WindowLoaded attached.
                this.speechEngine.SpeechRecognized -= SpeechRecognized;
                this.speechEngine.SpeechRecognitionRejected -= SpeechRejected;

        /// Handles a recognized utterance: when the confidence is high enough, dispatches on the
        /// semantic value of the result and runs the matching command.
        /// sender: the object that raised the event.
        /// e: event data carrying the recognition result.
        private void SpeechRecognized(object sender, SpeechRecognizedEventArgs e)
        {
            // Speech utterance confidence below which we treat speech as if it hadn't been heard
            const double ConfidenceThreshold = 0.3;

            if (e.Result.Confidence < ConfidenceThreshold)
            {
                return;
            }

            // The semantic value can be null when the matched phrase carries no semantic tag;
            // there is no command to dispatch in that case.
            object semanticValue = e.Result.Semantics.Value;
            if (semanticValue == null)
            {
                return;
            }

            // Every section ends with break: C# does not allow falling through from a
            // non-empty case into the next one.
            switch (semanticValue.ToString())
            {
                case "SPREAD":
                    this.CommandTextBox.Text = "Spread";
                    break;

                case "SOMETHING1":
                    break;

                case "SOMETHING2":
                    break;

                case "SOMETHING3":
                    break;
            }
        }

        /// Handler attached to the speech engine's SpeechRecognitionRejected event in WindowLoaded.
        // NOTE(review): no body is visible in this listing - presumably it intentionally does nothing; confirm.
        private void SpeechRejected(object sender, SpeechRecognitionRejectedEventArgs e)


        /// Handler for the sensor's SkeletonFrameReady event: redraws the skeleton image and,
        /// for every tracked skeleton, shows the arm angles in the text boxes and sends them
        /// to the Arduino when the port is flagged as open.
        private void SensorSkeletonFrameReady(object sender, SkeletonFrameReadyEventArgs e)
            Skeleton[] skeletons = new Skeleton[0];

            using (SkeletonFrame skeletonFrame = e.OpenSkeletonFrame())
                // The frame can be null, in which case the empty array is kept.
                if (skeletonFrame != null)
                    skeletons = new Skeleton[skeletonFrame.SkeletonArrayLength];
                    // NOTE(review): nothing visible here copies the frame's skeleton data into the
                    // array; as shown its elements stay null and the loop below would fail. A copy
                    // call is presumably missing from the listing - confirm.


            using (DrawingContext dc = this.drawingGroup.Open())
                // Draw a black background to set the render size
                dc.DrawRectangle(Brushes.Black, null, new Rect(0.0, 0.0, RenderWidth, RenderHeight));

                if (skeletons.Length != 0)
                    foreach (Skeleton skel in skeletons)
                        RenderClippedEdges(skel, dc);

                        if (skel.TrackingState == SkeletonTrackingState.Tracked)
                            this.DrawBonesAndJoints(skel, dc);

                            Angles MyAngles = new Angles(); //Instance of class angles
                            byte[] ReadyAngles = MyAngles.GetVector(skel);//Save angles to byte array and call GetVector
                            RightElbow.Text = ReadyAngles[0].ToString(); //Write Angle into textbox
                            RightShoulder.Text = ReadyAngles[1].ToString();//Write Angle into textbox
                            LeftElbow.Text = ReadyAngles[2].ToString();//Write Angle into textbox
                            LeftShoulder.Text = ReadyAngles[3].ToString();//Write Angle into textbox
                            // Single byte 255 is written ahead of the four angle bytes
                            // (presumably a start-of-packet marker for the Arduino - confirm).
                            byte[] SequenceStart = { 255 };

                            //if (ArduinoPort.IsOpen)
                            if (PortOpen)
                                ArduinoPort.Write(SequenceStart, 0, 1);
                                ArduinoPort.Write(ReadyAngles, 0, 4);
                        // NOTE(review): the body of this branch is not visible in the listing.
                        else if (skel.TrackingState == SkeletonTrackingState.PositionOnly)

                // prevent drawing outside of our render area
                this.drawingGroup.ClipGeometry = new RectangleGeometry(new Rect(0.0, 0.0, RenderWidth, RenderHeight));

        /// Draws all bones of a skeleton followed by a dot for every tracked or inferred joint.
        /// skeleton: skeleton to draw.
        /// drawingContext: drawing context to draw to.
        private void DrawBonesAndJoints(Skeleton skeleton, DrawingContext drawingContext)
        {
            // Bone end points, listed in drawing order.
            JointType[][] bones =
            {
                // Torso
                new[] { JointType.Head, JointType.ShoulderCenter },
                new[] { JointType.ShoulderCenter, JointType.ShoulderLeft },
                new[] { JointType.ShoulderCenter, JointType.ShoulderRight },
                new[] { JointType.ShoulderCenter, JointType.Spine },
                new[] { JointType.Spine, JointType.HipCenter },
                new[] { JointType.HipCenter, JointType.HipLeft },
                new[] { JointType.HipCenter, JointType.HipRight },

                // Left arm
                new[] { JointType.ShoulderLeft, JointType.ElbowLeft },
                new[] { JointType.ElbowLeft, JointType.WristLeft },
                new[] { JointType.WristLeft, JointType.HandLeft },

                // Right arm
                new[] { JointType.ShoulderRight, JointType.ElbowRight },
                new[] { JointType.ElbowRight, JointType.WristRight },
                new[] { JointType.WristRight, JointType.HandRight },

                // Left leg
                new[] { JointType.HipLeft, JointType.KneeLeft },
                new[] { JointType.KneeLeft, JointType.AnkleLeft },
                new[] { JointType.AnkleLeft, JointType.FootLeft },

                // Right leg
                new[] { JointType.HipRight, JointType.KneeRight },
                new[] { JointType.KneeRight, JointType.AnkleRight },
                new[] { JointType.AnkleRight, JointType.FootRight }
            };

            foreach (JointType[] bone in bones)
            {
                this.DrawBone(skeleton, drawingContext, bone[0], bone[1]);
            }

            // Joints: the brush depends on the tracking state; untracked joints are skipped.
            foreach (Joint joint in skeleton.Joints)
            {
                Brush jointBrush;
                switch (joint.TrackingState)
                {
                    case JointTrackingState.Tracked:
                        jointBrush = this.trackedJointBrush;
                        break;

                    case JointTrackingState.Inferred:
                        jointBrush = this.inferredJointBrush;
                        break;

                    default:
                        continue;
                }

                Point center = this.SkeletonPointToScreen(joint.Position);
                drawingContext.DrawEllipse(jointBrush, null, center, JointThickness, JointThickness);
            }
        }

        /// Maps a skeleton-space point to a 2D point in the 640x480 render area.
        /// skelpoint: point to map.
        /// Returns the X and Y of the corresponding depth-image point.
        private Point SkeletonPointToScreen(SkeletonPoint skelpoint)
        {
            // The depth value itself is not needed; mapping into 640x480 depth space
            // just happens to produce coordinates in our output resolution.
            var mapper = this.sensor.CoordinateMapper;
            DepthImagePoint mapped = mapper.MapSkeletonPointToDepthPoint(skelpoint, DepthImageFormat.Resolution640x480Fps30);

            return new Point(mapped.X, mapped.Y);
        }

        /// Draws one bone as a line between two joints of a skeleton.
        /// Nothing is drawn when either joint is not tracked or when both joints are only inferred.
        /// skeleton: skeleton whose joints are read.
        /// drawingContext: drawing context to draw to.
        /// jointType0: joint at which the bone starts.
        /// jointType1: joint at which the bone ends.
        private void DrawBone(Skeleton skeleton, DrawingContext drawingContext, JointType jointType0, JointType jointType1)
        {
            Joint joint0 = skeleton.Joints[jointType0];
            Joint joint1 = skeleton.Joints[jointType1];

            // If we can't find either of these joints, exit
            if (joint0.TrackingState == JointTrackingState.NotTracked ||
                joint1.TrackingState == JointTrackingState.NotTracked)
            {
                return;
            }

            // Don't draw if both points are inferred
            if (joint0.TrackingState == JointTrackingState.Inferred &&
                joint1.TrackingState == JointTrackingState.Inferred)
            {
                return;
            }

            // We assume all drawn bones are inferred unless BOTH joints are tracked
            Pen drawPen = this.inferredBonePen;
            if (joint0.TrackingState == JointTrackingState.Tracked && joint1.TrackingState == JointTrackingState.Tracked)
            {
                drawPen = this.trackedBonePen;
            }

            drawingContext.DrawLine(drawPen, this.SkeletonPointToScreen(joint0.Position), this.SkeletonPointToScreen(joint1.Position));
        }

        /// Handles toggling of the seated-mode check box: switches the skeleton stream between
        /// seated and default tracking. Does nothing when no sensor is active.
        private void CheckBoxSeatedModeChanged(object sender, RoutedEventArgs e)
        {
            if (null == this.sensor)
            {
                return;
            }

            // Exactly one mode is assigned; without the else, Default would always overwrite Seated.
            if (this.checkBoxSeatedMode.IsChecked.GetValueOrDefault())
            {
                this.sensor.SkeletonStream.TrackingMode = SkeletonTrackingMode.Seated;
            }
            else
            {
                this.sensor.SkeletonStream.TrackingMode = SkeletonTrackingMode.Default;
            }
        }

        /// Handles toggling of the "Arduino connected" check box: creates the serial port,
        /// sends an initial sequence, and reads a two-byte voltage value to display.
        // NOTE(review): several structural lines (opening the port, try/catch, else branches) are
        // not visible in this listing; the notes below mark the places where that matters.
        private void CheckBoxConnectArduino(object sender, RoutedEventArgs e)

            if (this.checkArduinoConnected.IsChecked.GetValueOrDefault())
                // Port name and baud rate are hard-coded.
                string MyCOM = "COM10";
                ArduinoPort = new SerialPort(MyCOM, 9600);
                // Five-byte sequence written once after connecting: 254 followed by four zeros
                // (presumably a reset/initial-position packet - confirm against the Arduino sketch).
                byte[] SequenceStart = { 254, 0, 0, 0, 0 };

                // NOTE(review): no call that opens the port is visible before this check.
                if (ArduinoPort.IsOpen)
                    PortOpen = true;
                    ArduinoPort.Write(SequenceStart, 0, 5);
                int count = ArduinoPort.BytesToRead;
                // Only exactly two pending bytes are treated as a voltage reading.
                if (count == 2)
                    short Voltage;
                    double VoltageDouble;
                    byte[] SerialInBytes = { 0, 0 };
                    SerialInBytes[0] = Convert.ToByte(ArduinoPort.ReadByte());
                    SerialInBytes[1] = Convert.ToByte(ArduinoPort.ReadByte());
                    // Combine the two received bytes into a 16-bit signed value.
                    Voltage = BitConverter.ToInt16(SerialInBytes, 0);
                    // NOTE(review): 10.653 / 1000 is presumably the raw-reading-to-volts scale factor - confirm.
                    VoltageDouble = Voltage * 10.653 / 1000;
                    string VoltageString = String.Format("{0:0.00}", VoltageDouble);
                    VoltageTextBox.Text = VoltageString;
                    // NOTE(review): as shown this message appears right after a successful read; it
                    // presumably belongs to an else/catch branch missing from the listing.
                    System.Windows.Forms.MessageBox.Show("No Arduino connected");
                // NOTE(review): as shown this resets the flag unconditionally; presumably it belongs
                // to the branch taken when the check box is unchecked.
                PortOpen = false;

Description: I have two classes: Angles and MainWindow (the main class). Angles calculates the joint angles and is not relevant to the question. MainWindow starts off with some fields, including the speech engine (speechEngine) and the Kinect sensor (sensor).

Here is the part I have problems with: as you can see, I have an event handler (SensorSkeletonFrameReady) that is called for every skeleton frame. Once the program has entered this handler, it seems to stay in a loop of drawing skeletons and calling the handler again.

My question: Where and how should I implement the code for the speech recognition, which is already written?

I know it is a big chunk of code, but I am grateful for any advice! Thank you.

PS: Here's a picture of my robot, which uses the angles to imitate my arm movements.


  • I found my mistake. Apparently I have to start the sensor, then start the audio stream and then add the event handler for the skeleton tracking. It works now. Thank you.