How to detect speech start on iOS Speech API

I would recommend low-pass filtering the power signal, using AVAudioRecorder for metering and an NSTimer for the polling callback. This way you can detect when the audio recorder readings cross a certain threshold, and the low-pass filter helps mitigate noise.

In the .h file:

#import <UIKit/UIKit.h>
#import <AVFoundation/AVFoundation.h>
#import <CoreAudio/CoreAudioTypes.h>

/// View controller that polls an AVAudioRecorder's meters on a timer and keeps a
/// low-pass-filtered input level that can be compared against thresholds.
@interface ViewController : UIViewController{
    /// Recorder used for level metering.
    AVAudioRecorder *recorder;
    /// Repeating timer that triggers -levelTimerCallback:.
    NSTimer *levelTimer;
    /// Running low-pass-filtered value of the metered peak level.
    double lowPassResults;
}

/// Timer callback: refreshes the recorder's meters and updates lowPassResults.
/// @param timer The timer that fired.
- (void)levelTimerCallback:(NSTimer *)timer;

@end

In the .m file:

#import "ViewController.h"

// Class extension: a place for private properties and methods (currently empty).
@interface ViewController ()

@end

@implementation ViewController

/**
 * Configures the audio session, starts a metering-enabled recorder and schedules
 * a repeating timer that polls the input level.
 */
- (void)viewDidLoad {
    [super viewDidLoad];

    // AVAudioSession already set in your code, so no need for these 2 lines.
    [[AVAudioSession sharedInstance] setCategory:AVAudioSessionCategoryPlayAndRecord error:nil];
    [[AVAudioSession sharedInstance] setActive:YES error:nil];

    // Only the meter readings are used, so the recorded audio is written to /dev/null.
    NSURL *url = [NSURL fileURLWithPath:@"/dev/null"];

    NSDictionary *settings = @{
        AVSampleRateKey: [NSNumber numberWithFloat:44100.0],
        AVFormatIDKey: [NSNumber numberWithInt:kAudioFormatAppleLossless],
        AVNumberOfChannelsKey: [NSNumber numberWithInt:1],
        AVEncoderAudioQualityKey: [NSNumber numberWithInt:AVAudioQualityMax]
    };

    NSError *error = nil;

    lowPassResults = 0;

    recorder = [[AVAudioRecorder alloc] initWithURL:url settings:settings error:&error];

    if (recorder) {
        [recorder prepareToRecord];
        // Metering must be enabled before the meter values can be read.
        recorder.meteringEnabled = YES;
        [recorder record];
        // Poll the meters every 0.05 s.
        // NOTE(review): a repeating NSTimer retains its target; invalidate levelTimer
        // when this controller should go away.
        levelTimer = [NSTimer scheduledTimerWithTimeInterval:0.05
                                                      target:self
                                                    selector:@selector(levelTimerCallback:)
                                                    userInfo:nil
                                                     repeats:YES];
    } else {
        NSLog(@"%@", [error description]);
    }
}

/**
 * Refreshes the recorder's meters and folds the current peak level into
 * lowPassResults using an exponential low-pass filter.
 *
 * @param timer The timer that fired (unused).
 */
- (void)levelTimerCallback:(NSTimer *)timer {
    [recorder updateMeters];

    // Smoothing factor: weight given to the newest reading.
    const double ALPHA = 0.05;
    // peakPowerForChannel: reports decibels; 10^(dB / 20) converts that to a linear amplitude.
    double peakPowerForChannel = pow(10, (0.05 * [recorder peakPowerForChannel:0]));
    lowPassResults = ALPHA * peakPowerForChannel + (1.0 - ALPHA) * lowPassResults;

    NSLog(@"lowPassResults: %f",lowPassResults);

    // Use a threshold value here to establish whether there is silence or speech.
    if (lowPassResults < 0.1) {
        // Below the lower threshold: treat as silence.
    } else if (lowPassResults > 0.5) {
        // Above the upper threshold: treat as speech.
    }
}

- (void)didReceiveMemoryWarning {
    [super didReceiveMemoryWarning];
    // Dispose of any resources that can be recreated.
}

@end


This is the code we ended up with that works.

The key was to install a tap on the audio engine's input node with installTapOnBus: and then use the following line inside the tap block to detect the volume:

// Absolute value of the first sample of channel 0 in the tap buffer, used as the volume level.
float volume = fabsf(*buffer.floatChannelData[0]);

/**
 * Starts a new speech-recognition session.
 *
 * Stops any engine/task left over from a previous call, configures the audio
 * session, creates a fresh AVAudioEngine and buffer recognition request, and
 * installs a tap on the input node. The tap feeds audio to the request and uses
 * the first sample of each buffer as a volume reading to detect speech start.
 */
-(void) doActualRecording {

    @try {
        // Tear down whatever a previous call left running.
        if (audioEngine != NULL) {
            [audioEngine stop];
            [speechTask cancel];
            AVAudioInputNode* inputNode = [audioEngine inputNode];
            [inputNode removeTapOnBus: 0];
        }

        recording = YES;
        micButton.selected = YES;

        NSError * outError = nil;
        AVAudioSession* audioSession = [AVAudioSession sharedInstance];
        // Check the return values: the error pointer alone does not indicate failure.
        if (![audioSession setCategory: AVAudioSessionCategoryPlayAndRecord withOptions:AVAudioSessionCategoryOptionDefaultToSpeaker error:&outError]) {
            NSLog(@"Error %@", outError);
        }
        if (![audioSession setMode: AVAudioSessionModeMeasurement error:&outError]) {
            NSLog(@"Error %@", outError);
        }
        if (![audioSession setActive: true withOptions: AVAudioSessionSetActiveOptionNotifyOthersOnDeactivation error:&outError]) {
            NSLog(@"Error %@", outError);
        }

        SFSpeechAudioBufferRecognitionRequest* speechRequest = [[SFSpeechAudioBufferRecognitionRequest alloc] init];
        if (speechRequest == nil) {
            NSLog(@"Unable to create SFSpeechAudioBufferRecognitionRequest.");
            // Nothing can be recognized without a request.
            return;
        }

        speechDetectionSamples = 0;

        // This some how fixes a crash on iPhone 7
        // Seems like a bug in iOS ARC/lack of gc
        // (temp keeps the previous engine referenced until this method returns.)
        AVAudioEngine* temp = audioEngine;
        (void)temp;
        audioEngine = [[AVAudioEngine alloc] init];
        AVAudioInputNode* inputNode = [audioEngine inputNode];

        speechRequest.shouldReportPartialResults = true;

        // iOS speech does not detect end of speech, so must track silence.
        // NOTE(review): lastSpeechDetected is only reset to -1 in this method;
        // presumably the recognition delegate stores the timestamp - verify.
        lastSpeechDetected = -1;

        speechTask = [speechRecognizer recognitionTaskWithRequest: speechRequest delegate: self];

        [inputNode installTapOnBus:0 bufferSize: 4096 format: [inputNode outputFormatForBus:0] block:^(AVAudioPCMBuffer* buffer, AVAudioTime* when) {
            @try {
                long long millis = [[NSDate date] timeIntervalSince1970] * 1000;
                // More than one second since speech was last detected: finish the task.
                if (lastSpeechDetected != -1 && ((millis - lastSpeechDetected) > 1000)) {
                    lastSpeechDetected = -1;
                    [speechTask finish];
                    // The task is finished; do not feed it this buffer.
                    return;
                }
                [speechRequest appendAudioPCMBuffer: buffer];

                //Calculate volume level
                if ([buffer floatChannelData] != nil && buffer.frameLength > 0) {
                    // Only the first sample of channel 0 is inspected.
                    float volume = fabsf(*buffer.floatChannelData[0]);

                    if (volume >= speechDetectionThreshold) {
                        // Count consecutive buffers at or above the threshold.
                        speechDetectionSamples++;

                        if (speechDetectionSamples >= speechDetectionSamplesNeeded) {

                            //Need to change mic button image in main thread
                            [[NSOperationQueue mainQueue] addOperationWithBlock:^ {

                                [micButton setImage: [UIImage imageNamed: @"micRecording"] forState: UIControlStateSelected];
                            }];
                        }
                    } else {
                        // A quiet buffer breaks the run.
                        speechDetectionSamples = 0;
                    }
                }
            }
            @catch (NSException * e) {
                NSLog(@"Exception: %@", e);
            }
        }];

        [audioEngine prepare];
        if (![audioEngine startAndReturnError: &outError]) {
            NSLog(@"Error %@", outError);
        }
    }
    @catch (NSException * e) {
        NSLog(@"Exception: %@", e);
    }
}