I have a React Native (Expo) app which captures audio using the expo-av library.
It then uploads the audio file to Amazon S3 and transcribes it with Amazon Transcribe.
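For reference, a minimal sketch of the upload step, assuming it goes through the same Python backend that calls Transcribe (bucket name and key are placeholders):

import boto3

s3_client = boto3.client('s3')

# Upload the recorded audio file to S3; Transcribe is then pointed at its S3 URI.
s3_client.upload_file('recording.m4a', 'my-audio-bucket', 'uploads/recording.m4a')
file_uri = 's3://my-audio-bucket/uploads/recording.m4a'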
For Android, I save the audio as a '.m4a' file and call the Amazon Transcribe API as:
transcribe_client.start_transcription_job(
    TranscriptionJobName=job_name,
    Media={'MediaFileUri': file_uri},
    MediaFormat='mp4',
    LanguageCode='en-US')
What should the 'MediaFormat' be for an upload from an iOS device, which will typically be a '.caf' file?
Amazon Transcribe only allows these media formats: MP3, MP4, WAV, FLAC, AMR, OGG, and WebM.
Possible solutions:
Create an API which does the conversion for you.
You can easily create one using, for example, FFmpeg from Python.
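A minimal sketch of the conversion step, assuming the ffmpeg binary is installed on the server and called via subprocess (file names are placeholders):

import subprocess

def convert_to_wav(input_path, output_path):
    # ffmpeg decodes the .caf container and writes 16-bit PCM WAV,
    # which Amazon Transcribe accepts with MediaFormat='wav'.
    subprocess.run(
        ['ffmpeg', '-y', '-i', input_path, '-ar', '16000', '-ac', '1', output_path],
        check=True)

convert_to_wav('recording.caf', 'recording.wav')

You can expose this behind whatever web framework or Lambda setup you already use, then upload the resulting .wav to S3.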
Use an already-made API.
By using the CloudConvert API you can convert the file with ease, but only if you pay for it.
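For illustration, a rough sketch of what a CloudConvert job could look like from Python with the requests library (the payload shape follows CloudConvert's v2 jobs API as I understand it; the source URL, key, and task names here are assumptions, so check their docs):

import requests

API_KEY = 'your-cloudconvert-api-key'  # placeholder

# Create a job that imports the uploaded file by URL, converts it to mp3,
# and exports a download URL for the result.
response = requests.post(
    'https://api.cloudconvert.com/v2/jobs',
    headers={'Authorization': f'Bearer {API_KEY}'},
    json={
        'tasks': {
            'import-audio': {'operation': 'import/url',
                             'url': 'https://example.com/recording.caf'},
            'convert-audio': {'operation': 'convert',
                              'input': 'import-audio',
                              'output_format': 'mp3'},
            'export-audio': {'operation': 'export/url',
                             'input': 'convert-audio'},
        }
    })
response.raise_for_status()
print(response.json())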
Use a different library to record the iOS audio.
There's a module called react-native-record-audio-ios which is made entirely for iOS and records audio in .caf, .m4a, and .wav.
Use the LAME API to convert it.
As said here, you can convert a .caf file into an .mp3 one, probably by creating a native module which would run something like this:
#include <stdio.h>
#include <lame/lame.h>

// Note: LAME expects raw, interleaved 16-bit PCM samples, so the CAF
// container/header has to be decoded to raw PCM before this step.
FILE *pcm = fopen("file.caf", "rb");
FILE *mp3 = fopen("file.mp3", "wb");

const int PCM_SIZE = 8192;
const int MP3_SIZE = 8192;
short int pcm_buffer[PCM_SIZE * 2];
unsigned char mp3_buffer[MP3_SIZE];
int read, write;

lame_t lame = lame_init();
lame_set_in_samplerate(lame, 44100);
lame_set_VBR(lame, vbr_default);
lame_init_params(lame);

do {
    // Read a block of interleaved stereo samples; encode it, or flush
    // the encoder once the input is exhausted.
    read = fread(pcm_buffer, 2 * sizeof(short int), PCM_SIZE, pcm);
    if (read == 0)
        write = lame_encode_flush(lame, mp3_buffer, MP3_SIZE);
    else
        write = lame_encode_buffer_interleaved(lame, pcm_buffer, read, mp3_buffer, MP3_SIZE);
    fwrite(mp3_buffer, write, 1, mp3);
} while (read != 0);

lame_close(lame);
fclose(mp3);
fclose(pcm);
Objective-C code:
- (void)convertToWav
{
// set up an AVAssetReader to read the .caf file from the app bundle
NSString *cafFilePath=[[NSBundle mainBundle]pathForResource:@"test" ofType:@"caf"];
NSURL *assetURL = [NSURL fileURLWithPath:cafFilePath];
AVURLAsset *songAsset = [AVURLAsset URLAssetWithURL:assetURL options:nil];
NSError *assetError = nil;
AVAssetReader *assetReader = [AVAssetReader assetReaderWithAsset:songAsset
error:&assetError];
if (assetError) {
NSLog (@"error: %@", assetError);
return;
}
AVAssetReaderOutput *assetReaderOutput = [AVAssetReaderAudioMixOutput
assetReaderAudioMixOutputWithAudioTracks:songAsset.tracks
audioSettings: nil];
if (! [assetReader canAddOutput: assetReaderOutput]) {
NSLog (@"can't add reader output... die!");
return;
}
[assetReader addOutput: assetReaderOutput];
NSString *title = @"MyRec";
NSArray *docDirs = NSSearchPathForDirectoriesInDomains (NSDocumentDirectory, NSUserDomainMask, YES);
NSString *docDir = [docDirs objectAtIndex: 0];
NSString *wavFilePath = [[docDir stringByAppendingPathComponent :title]
stringByAppendingPathExtension:@"wav"];
if ([[NSFileManager defaultManager] fileExistsAtPath:wavFilePath])
{
[[NSFileManager defaultManager] removeItemAtPath:wavFilePath error:nil];
}
NSURL *exportURL = [NSURL fileURLWithPath:wavFilePath];
AVAssetWriter *assetWriter = [AVAssetWriter assetWriterWithURL:exportURL
fileType:AVFileTypeWAVE
error:&assetError];
if (assetError)
{
NSLog (@"error: %@", assetError);
return;
}
AudioChannelLayout channelLayout;
memset(&channelLayout, 0, sizeof(AudioChannelLayout));
channelLayout.mChannelLayoutTag = kAudioChannelLayoutTag_Stereo;
NSDictionary *outputSettings = [NSDictionary dictionaryWithObjectsAndKeys:
[NSNumber numberWithInt:kAudioFormatLinearPCM], AVFormatIDKey,
[NSNumber numberWithFloat:44100.0], AVSampleRateKey,
[NSNumber numberWithInt:2], AVNumberOfChannelsKey,
[NSData dataWithBytes:&channelLayout length:sizeof(AudioChannelLayout)], AVChannelLayoutKey,
[NSNumber numberWithInt:16], AVLinearPCMBitDepthKey,
[NSNumber numberWithBool:NO], AVLinearPCMIsNonInterleaved,
[NSNumber numberWithBool:NO],AVLinearPCMIsFloatKey,
[NSNumber numberWithBool:NO], AVLinearPCMIsBigEndianKey,
nil];
AVAssetWriterInput *assetWriterInput = [AVAssetWriterInput assetWriterInputWithMediaType:AVMediaTypeAudio
outputSettings:outputSettings];
if ([assetWriter canAddInput:assetWriterInput])
{
[assetWriter addInput:assetWriterInput];
}
else
{
NSLog (@"can't add asset writer input... die!");
return;
}
assetWriterInput.expectsMediaDataInRealTime = NO;
[assetWriter startWriting];
[assetReader startReading];
AVAssetTrack *soundTrack = [songAsset.tracks objectAtIndex:0];
CMTime startTime = CMTimeMake (0, soundTrack.naturalTimeScale);
[assetWriter startSessionAtSourceTime: startTime];
__block UInt64 convertedByteCount = 0;
dispatch_queue_t mediaInputQueue = dispatch_queue_create("mediaInputQueue", NULL);
[assetWriterInput requestMediaDataWhenReadyOnQueue:mediaInputQueue
usingBlock: ^
{
while (assetWriterInput.readyForMoreMediaData)
{
CMSampleBufferRef nextBuffer = [assetReaderOutput copyNextSampleBuffer];
if (nextBuffer)
{
// append buffer
[assetWriterInput appendSampleBuffer: nextBuffer];
convertedByteCount += CMSampleBufferGetTotalSampleSize (nextBuffer);
CMTime progressTime = CMSampleBufferGetPresentationTimeStamp(nextBuffer);
CMTime sampleDuration = CMSampleBufferGetDuration(nextBuffer);
if (CMTIME_IS_NUMERIC(sampleDuration))
progressTime= CMTimeAdd(progressTime, sampleDuration);
float dProgress= CMTimeGetSeconds(progressTime) / CMTimeGetSeconds(songAsset.duration);
NSLog(@"%f",dProgress);
}
else
{
[assetWriterInput markAsFinished];
[assetReader cancelReading];
// finalize the output file; without this the WAV is never properly closed
[assetWriter finishWritingWithCompletionHandler:^{
NSLog(@"WAV conversion complete, %llu bytes converted", convertedByteCount);
}];
}
}
}];
}
But, as said here, the iPhone shouldn't really be used for processor-intensive work such as audio conversion.
So I recommend the third solution, because it's easier and doesn't look like an intensive task for the iPhone's processor.
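Whichever option you pick, once the audio is in one of the supported formats you only need to pass a matching MediaFormat; for example, for a .wav file uploaded to S3 (the job name, bucket, and key below are placeholders):

import boto3

transcribe_client = boto3.client('transcribe')

job_name = 'ios-recording-transcription'  # placeholder

# Same call as for Android, just with MediaFormat matching the converted file.
transcribe_client.start_transcription_job(
    TranscriptionJobName=job_name,
    Media={'MediaFileUri': 's3://my-audio-bucket/uploads/recording.wav'},
    MediaFormat='wav',
    LanguageCode='en-US')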