Search code examples
goaudiocaf

get duration of caf audio file with go lang


I want to get the duration of .caf audio files using Go. I found a few decoders, but their Duration() methods just return 0, with commented-out code that perhaps suggests ways of calculating the duration. Does anyone know if these comments are legitimate and, if so, how I might calculate the duration? I'll accept "it's not possible" as an answer if there's no easy solution.

// Duration is an unimplemented stub: it always returns 0.
//
// The two commented-out lines below sketch candidate formulas (byte size
// divided by average bytes per second, and sample-frame count divided by
// sample rate). Both reference a variable p that is not defined in this
// method, so neither compiles as written.
func (d *Decoder) Duration() time.Duration {
    //duration := time.Duration((float64(p.Size) / float64(p.AvgBytesPerSec)) * float64(time.Second))
    //duration := time.Duration(float64(p.NumSampleFrames) / float64(p.SampleRate) * float64(time.Second))

    return 0
}

one implementation example although i'm happy to use any implementation that's easy to install: https://github.com/mattetti/audio/blob/master/caf/decoder.go


Solution

  • The doc comments in the file you linked are taken directly from Apple's spec. In those docs, you'll find these two important things:

    "The duration of the audio in the file is [the number of valid frames] divided by the sample rate specified in the file’s Audio Description chunk."

    OK, cool, but how many valid frames are there? There are two possible ways to know:

    • If the CAF has a packet table, it must include the number of valid frames. Perfect.
    • The only CAFs that are allowed to NOT have a packet table are those with constant packet sizes:

    "Note that, as long as the format has a constant number of frames per packet, you can calculate the duration of each packet by dividing the mSampleRate [frames per second] value by the mFramesPerPacket value."

    That tells you the duration per packet, and because packets are a constant size, the number of packets is just audioDataSize / bytesPerPacket. The latter value is included in the Audio Description. The former is usually stored in the data chunk's header, but it's permitted to be -1 when the audio data is the last chunk, in which case its size is totalFileSize - startOfAudioData. (Note that the data chunk begins with a 4-byte edit count, which is not audio and must be excluded from audioDataSize.)

    It breaks down like this:

    • If there's a Packet Table Chunk, use it and the Audio Description: seconds = validFrames / sampleRate
    • Otherwise, packets must have constant size:
      • framesPerByte = framesPerPacket / bytesPerPacket
      • frames = framesPerByte * audioDataSize
      • seconds = frames / sampleRate

    The library you've got reads the Audio Description chunk, but I don't think it reads the Packet Table. Also, I'm not confident it calculates the audio data size if the chunk is -1. Maybe it does both/either, in which case, you can use the information above.

    If not, you can just parse the file yourself, especially if you only care about the duration. The file starts with a short header, then is split into "chunks" (aka TLVs). Here's a sample implementation you can use as a starting point or to modify the library you linked:


    
    // readCAF is a runnable example: it builds a tiny in-memory CAF file
    // (file header, audio description, and a data chunk that runs to EOF),
    // computes its duration and prints it as "seconds: <value>".
    //
    // It panics if the sample cannot be parsed.
    func readCAF() {
        buf := []byte{
            // file header
            'c', 'a', 'f', 'f', // file type
            0x0, 0x1, 0x0, 0x0, // file version, flags

            // audio description
            'd', 'e', 's', 'c', // chunk type
            0x0, 0x0, 0x0, 0x0,
            0x0, 0x0, 0x0, 0x20, // CAFAudioFormat size

            0x40, 0xe5, 0x88, 0x80,
            0x00, 0x00, 0x00, 0x00, // sample rate (44100.0 as a big-endian float64)
            'l', 'p', 'c', 'm', // fmt id
            0x0, 0x0, 0x0, 0x0, // fmt flags
            0x0, 0x0, 0x0, 0x1, // bytes per packet
            0x0, 0x0, 0x0, 0x1, // frames per packet
            0x0, 0x0, 0x0, 0x2, // channels per frame
            0x0, 0x0, 0x0, 0x3, // bits per channel

            // audio data
            'd', 'a', 't', 'a', // chunk type
            0xff, 0xff, 0xff, 0xff,
            0xff, 0xff, 0xff, 0xff, // size of data section (-1 = til EOF)
            0x0, 0x0, 0x0, 0x0, // edit count (first field of the data chunk, not audio)

            // actual audio packets (in theory, anyway)
            0x0,
            0x0,
            0x0,
            0x0,
            0x0,
            0x0,
        }

        seconds, err := cafDuration(bytes.NewBuffer(buf), int64(len(buf)))
        if err != nil {
            panic(err)
        }

        fmt.Printf("seconds: %f\n", seconds)
    }

    // cafDuration walks the chunks of a CAF stream and returns the audio
    // duration in seconds.
    //
    // r must be positioned at the start of the file. fileSize is the total
    // size of the file in bytes; it is only consulted when the data chunk
    // declares its size as -1 ("runs to end of file").
    //
    // If a packet table with a positive valid-frame count is present, the
    // duration is validFrames / sampleRate. Otherwise the format must have a
    // constant packet size and the duration is derived from the number of
    // audio bytes. An error is returned for malformed or truncated input.
    func cafDuration(r io.Reader, fileSize int64) (float64, error) {
        type cafHdr struct {
            Typ     [4]byte
            Version uint16
            _       uint16
        }

        type chunkHdr struct {
            Typ [4]byte
            Sz  int64
        }

        type audioDescription struct {
            FramesPerSec     float64
            FmtId            uint32
            FmtFlags         uint32
            BytesPerPacket   uint32
            FramesPerPacket  uint32
            ChannelsPerFrame uint32
            BitsPerChannel   uint32
        }

        // The on-disk packet table header is 24 bytes: two 64-bit counts
        // followed by two 32-bit counts. The field widths must match exactly,
        // because binary.Read consumes as many bytes as the struct occupies.
        type packetTable struct {
            NPackets, NValidFrames   int64
            NPrimingFr, NRemainingFr int32
        }

        const (
            fileHeaderSz  = 8
            chunkHeaderSz = 12
            audioDescSz   = 32
            packetHdrSz   = 24
            editCountSz   = 4 // the data chunk starts with a uint32 edit count
        )

        br := bufio.NewReader(r)

        var fileHdr cafHdr
        if err := binary.Read(br, binary.BigEndian, &fileHdr); err != nil {
            return 0, fmt.Errorf("reading file header: %w", err)
        }
        if fileHdr.Typ != [4]byte{'c', 'a', 'f', 'f'} || fileHdr.Version != 1 {
            return 0, fmt.Errorf("unknown file format")
        }
        remaining := fileSize - fileHeaderSz

        var (
            audioDesc   audioDescription
            packetTab   packetTable
            audioDataSz int64
        )

    readChunks:
        for {
            var hdr chunkHdr
            if err := binary.Read(br, binary.BigEndian, &hdr); err != nil {
                // binary.Read reports a bare io.EOF only when no bytes at all
                // were available, i.e. the previous chunk was the last one.
                if err == io.EOF {
                    break readChunks
                }
                return 0, fmt.Errorf("reading chunk header: %w", err)
            }
            remaining -= chunkHeaderSz

            switch hdr.Typ {
            case [4]byte{'d', 'e', 's', 'c'}: // audio description
                if err := binary.Read(br, binary.BigEndian, &audioDesc); err != nil {
                    return 0, fmt.Errorf("reading audio description: %w", err)
                }
                hdr.Sz -= audioDescSz
                remaining -= audioDescSz

            case [4]byte{'p', 'a', 'k', 't'}: // packet table
                if err := binary.Read(br, binary.BigEndian, &packetTab); err != nil {
                    return 0, fmt.Errorf("reading packet table: %w", err)
                }
                hdr.Sz -= packetHdrSz
                remaining -= packetHdrSz

            case [4]byte{'d', 'a', 't', 'a'}: // audio data
                size := hdr.Sz
                if size == -1 {
                    // The data chunk is the last chunk and runs to EOF.
                    size = remaining
                }
                if size < editCountSz {
                    return 0, fmt.Errorf("invalid data chunk size %d", hdr.Sz)
                }
                audioDataSz = size - editCountSz
                if hdr.Sz == -1 {
                    break readChunks
                }
            }

            // Also catches chunks that declared fewer bytes than their fixed
            // header occupies (the subtraction above went negative).
            if hdr.Sz < 0 {
                return 0, fmt.Errorf("invalid header size")
            }
            remaining -= hdr.Sz

            // Skip the rest of the chunk. io.CopyN takes an int64, so a large
            // chunk size cannot overflow on 32-bit platforms.
            if _, err := io.CopyN(io.Discard, br, hdr.Sz); err != nil {
                return 0, fmt.Errorf("skipping chunk %q: %w", hdr.Typ[:], err)
            }
        }

        // Written as a negated comparison so that NaN is rejected as well.
        if !(audioDesc.FramesPerSec > 0) {
            return 0, fmt.Errorf("missing or invalid audio description")
        }

        // If the data included a packet table, its frame count determines
        // the duration.
        if packetTab.NValidFrames > 0 {
            return float64(packetTab.NValidFrames) / audioDesc.FramesPerSec, nil
        }

        // Without a packet table the format must have a constant packet size.
        if audioDesc.BytesPerPacket == 0 || audioDesc.FramesPerPacket == 0 {
            return 0, fmt.Errorf("bad data")
        }
        framesPerByte := float64(audioDesc.FramesPerPacket) / float64(audioDesc.BytesPerPacket)
        frames := framesPerByte * float64(audioDataSz)
        return frames / audioDesc.FramesPerSec, nil
    }