Search code examples
python audio pcm adpcm

Problem with converting ADPCM to wav in Python


I am trying to extract audio files from .fsb files used by Dragon Age: Origins. FSB means "FMOD Sample Bank" and the game uses FSB4. Because it is a proprietary format, Google is almost useless in finding relevant information about it, but I managed to find this program and this repository.

Looking at the files in a hex editor, it was easy for me to guess the structure of the format, it begins with a 48 byte header of the file, that starts with the 4 byte file signature "FSB4", then 4 byte number of audio files contained in the archive, then 4 byte size of header array for the individual files, 4 byte size of the actual file data, then some other stuff.

Then the header array for the files begins, each entry is almost always 80 bytes, it contains the name of the file, number of frames in the file, the data length of the file, and some other stuff.

Then the header array is followed by 16 null bytes, after that the actual data is stored contiguously head to tail, in the order they are listed.

There were some other things that I couldn't decode, and those were filled by the code from the aforementioned repository and testing with the GUI program.

With my reverse engineering I have written a 100% working program to extract audio files from the .fsb files, it works, but for some files the extracted audio is distorted.

The .fsb file used by the game stores audio in 3 formats: MPEG, PCM, and ADPCM. I know MPEG files are .mp3 files and they are stored with complete headers, so I just store the slices (self.data[start:end]) directly, and I know PCM files are just .wav files without headers, so I just use wave to add the appropriate header and write the data.

The problem is with ADPCM, I don't know how to convert ADPCM to PCM, and all the results I can find use audioop.adpcm2lin(adpcm, 2, None). They are all terribly outdated and the documentation says it is deprecated.

I use it in my code and it kind of works, but the audio extracted is way too fast, high pitched, and sounds mechanical. I don't know why.

Here is the code:

import audioop
import os
import struct
import wave
from pathlib import Path

class FSB4:
    """Reader and extractor for FSB4 (FMOD Sample Bank) archives.

    The whole archive is loaded into memory and its 48-byte file header and
    per-sample entry headers are parsed on construction.

    Attributes:
        data: Raw bytes of the archive.
        headers: File-header fields keyed by the names in ``HEADERS``.
        entries: Sample name -> parsed entry header (keys from
            ``ENTRY_HEADERS`` plus ``"format"``).
        offsets: Sample name -> ``(start, end)`` byte offsets of the
            sample payload inside ``data``.
        chunks: Raw header slices in file order: the file header, every
            entry header, then the 16 bytes that follow the last entry.
    """

    HEADERS = (
        "signature",
        "file_count",
        "header_size",
        "data_size",
        "version",
        "flags",
        "padding",
        "hash"
    )
    ENTRY_HEADERS = (
        "frames",
        "data_size",
        "loop_start",
        "loop_end",
        "mode",
        "frequency",
        "pan",
        "defpri",
        "min_distance",
        "channels",
        "max_distance",
        "var_frequency",
        "var_vol",
        "var_pan"
    )

    def __init__(self, file):
        """Load the archive at path *file* and parse all of its headers."""
        self.data = Path(file).read_bytes()
        self.parse_header()
        self.parse_entries()

    def parse_header(self):
        """Decode the 48-byte file header into ``self.headers``.

        Also (re)starts ``self.chunks`` with the raw header bytes.
        """
        headers = self.data[:48]
        self.chunks = [headers]
        self.headers = dict(
            zip(
                self.HEADERS,
                struct.unpack("<4s5I8s16s", headers)
            )
        )

    def parse_entry(self, data):
        """Decode one entry header.

        Args:
            data: The raw entry header.  Only the first 80 bytes are
                decoded; anything beyond that is kept in ``self.chunks``
                but otherwise ignored.

        Returns:
            ``(name, entry)`` where *name* is the decoded sample name and
            *entry* maps ``ENTRY_HEADERS`` to the decoded values, plus a
            ``"format"`` key of ``"MPEG"``, ``"PCM"`` or ``"ADPCM"``.

        Raises:
            struct.error: If *data* holds fewer than 80 bytes.
        """
        # Field 0 is the entry's own size, field 1 the NUL-padded name.
        chunks = struct.unpack("<H30s6I4H2f2I", data[:80])
        self.chunks.append(data)
        flags = chunks[6]  # the "mode" field
        entry = dict(zip(
            self.ENTRY_HEADERS,
            chunks[2:]
        ))
        # Bit 512 of the mode field selects MPEG, bit 16 selects PCM;
        # everything else is treated as ADPCM.
        entry["format"] = (
            "MPEG" if flags & 512 else (
                "PCM" if flags & 16 else "ADPCM"
            )
        )
        return (
            chunks[1].strip(b"\x00").decode(),
            entry
        )

    def parse_entries(self):
        """Walk the entry-header array and fill ``entries``/``offsets``.

        Each entry starts with a little-endian 16-bit size, which is used
        to find the next entry so that headers longer than 80 bytes do not
        desynchronise the walk.  Sample payloads are laid out back to back
        starting at ``header_size + 48``.
        """
        count = self.headers["file_count"]
        offset = self.headers["header_size"] + 48
        self.entries = {}
        self.offsets = {}
        pos = 48
        for _ in range(count):
            declared = int.from_bytes(self.data[pos:pos + 2], "little")
            # The fixed part of an entry is 80 bytes, so never step by less
            # than that even if the size field is missing or bogus.
            entry_size = max(declared, 80)
            name, entry = self.parse_entry(self.data[pos:pos + entry_size])
            self.entries[name] = entry
            length = entry["data_size"]
            self.offsets[name] = (offset, offset + length)
            offset += length
            pos += entry_size

        # The 16 bytes right after the last entry.  Using ``pos`` instead
        # of the loop variable keeps this valid for an archive with no
        # entries.
        self.chunks.append(self.data[pos:pos + 16])

    def extract_mp3(self, file, folder):
        """Write the raw payload of sample *file* to *folder* as ``.mp3``."""
        filename = file.rsplit(".", 1)[0] + ".mp3"
        with open(os.path.join(folder, filename), "wb") as f:
            start, end = self.offsets[file]
            f.write(self.data[start:end])

    def extract(self, file, folder):
        """Write sample *file* into *folder*.

        MPEG samples are copied verbatim to ``<stem>.mp3``.  PCM samples
        are wrapped in a 16-bit WAV container.  ADPCM samples are passed
        through ``audioop.adpcm2lin`` first.

        Raises:
            KeyError: If *file* is not a sample name in this archive.
        """
        entry = self.entries[file]
        audio_format = entry["format"]
        if audio_format == "MPEG":
            self.extract_mp3(file, folder)
            return

        path = os.path.join(folder, file.rsplit(".", 1)[0] + ".wav")
        start, end = self.offsets[file]
        pcm = self.data[start:end]
        if audio_format == "ADPCM":
            # NOTE(review): adpcm2lin decodes the payload as one
            # continuous, headerless 4-bit stream starting from a fresh
            # state.  If the FSB payload is block-structured (per-block
            # headers, interleaved channels) the result will be distorted
            # -- verify against a reference decoder.  audioop is also
            # deprecated and removed in Python 3.13.
            pcm, _ = audioop.adpcm2lin(pcm, 2, None)

        with wave.open(path, "wb") as wav:
            wav.setparams((
                entry["channels"],
                2,
                entry["frequency"],
                entry["frames"],
                'NONE',
                'NONE',
            ))
            wav.writeframes(pcm)

    def extract_all(self, folder):
        """Extract every sample in the archive into *folder*."""
        for file in self.entries:
            self.extract(file, folder)

And a file to test with.

I can already pack sound files into an FSB4 file but I still can't fix the problem, how I can fix this?


Solution

  • I have solved this problem. It turns out the solution is extremely simple.

    By inspecting the .wav files extracted with wav.setparams for PCM files and the ADPCM files extracted by Aezay's FSB extractor I have determined that the correct solution is simply adding the appropriate header to the raw data stream, as I have found that all the working solutions simply add the header before the data.

    FSB extractor just adds a header for ADPCM wav files, but for whatever reason audioop.adpcm2lin changes the data stream and causes the problems.

    Now from my testing I have found that wav.setparams uses a 44 byte header for PCM files and FSB extractor uses a 60 byte header for ADPCM files. They begin with similar chunks but ultimately they diverge. Both differ from all the .wav header specifications I can find however, I have found about 5 of them, all of them are wrong, but some are close to the 44 byte header format used by the wave library.

    I spent a lot of time trying to reconcile the header formats I found by using Google and it was futile. The values simply don't line up perfectly. Eventually I gave up trying to follow the search results and instead I tried to reverse engineer the formats myself, and then I solved the problem.

    The header format for wav is as follows, all numbers are in little endian:

    • 0 b"RIFF"
    • 1 length of file + 4, in UInt32
    • 2 b"WAVEfmt "

    ADPCM uses a 60 byte header and PCM uses a 44 byte header, the length of file is simply the length of the data plus the length of the header. I have no idea why there is the plus 4 bit though.

    Now these header formats diverge, for ADPCM the next 6 bytes are always the same:

    • 3 b"\x14\x00\x00\x00\x11\x00"

    For PCM the next 6 bytes are always the same:

    • 3 b"\x10\x00\x00\x00\x01\x00"

    I have no idea what these bytes mean, but the first 4 bytes seem to be related to the length of the header.

    Then they converge for the next two chunks:

    • 4 number of channels UInt16
    • 5 frequency UInt32

    Then they diverge again.

    The next 4 bytes for PCM wav header is the byte rate, I have found it to be:

    • 6 frequency * size / samples UInt32

    I have no idea what these 4 bytes for ADPCM mean, I calculated the numbers and they simply don't equal the numbers I got at that position, but for whatever reason, from my tests with a hex editor, they don't seem to have an effect on whether or not the audio can be played successfully, so I just use b"\x00\x00\x00\x00" to fill it.

    The next chunk is evidently related to the number of channels, for PCM:

    • 7 channels * 2 UInt16

    For ADPCM:

    • 7 channels * 36 UInt16

    Now the last chunks, for PCM the last two chunks are:

    • 8 b"\x10\x00data"
    • 9 length of actual data UInt32

    For ADPCM, they are:

    • 8 b"\x04\x00\x02\x00\x00\x00fact\x04\x00\x00\x00"
    • 9 number of samples UInt32
    • 10 b"data"
    • 11 length of actual data UInt32

    Here is the code, I removed the wave and audioop imports because I don't need to use them. I also fixed the problem where if an entry header has more than 80 bytes it will break the code.

    import os
    import struct
    from pathlib import Path
    
    
    class FSB4:
        """Reader and extractor for FSB4 (FMOD Sample Bank) archives.

        The archive is loaded into memory and parsed on construction.
        MPEG samples are written out verbatim; PCM and ADPCM samples get a
        RIFF/WAVE header prepended to their untouched payload.

        Attributes:
            data: Raw bytes of the archive.
            headers: File-header fields keyed by the names in ``HEADERS``.
            entries: Sample name -> parsed entry header (keys from
                ``ENTRY_HEADERS`` plus ``"format"`` and, for entries longer
                than 80 bytes, ``"extra"``).
            offsets: Sample name -> ``(start, end)`` byte offsets of the
                sample payload inside ``data``.
            chunks: Raw header slices in file order: the file header, every
                entry header, then the 16 bytes that follow the last entry.
        """

        HEADERS = (
            "signature",
            "file_count",
            "header_size",
            "data_size",
            "version",
            "flags",
            "padding",
            "hash"
        )
        ENTRY_HEADERS = (
            "frames",
            "data_size",
            "loop_start",
            "loop_end",
            "mode",
            "frequency",
            "pan",
            "defpri",
            "min_distance",
            "channels",
            "max_distance",
            "var_frequency",
            "var_vol",
            "var_pan"
        )

        @staticmethod
        def adpcm_wav_header(info):
            """Return the 60-byte WAV header for an ADPCM sample.

            Args:
                info: Entry mapping; reads ``"data_size"``, ``"channels"``,
                    ``"frequency"`` and ``"frames"``.

            Returns:
                The header bytes to be written in front of the payload.
            """
            data_size = info["data_size"]
            channels = info["channels"]
            return b"".join((
                b"RIFF",
                # The RIFF size counts everything after this field: the
                # remaining 52 header bytes plus the payload.
                (data_size + 52).to_bytes(4, "little"),
                b"WAVEfmt ",
                # fmt chunk length 20, format tag 0x0011 (IMA/DVI ADPCM).
                b"\x14\x00\x00\x00\x11\x00",
                channels.to_bytes(2, "little"),
                info["frequency"].to_bytes(4, "little"),
                # NOTE(review): average bytes/second is written as 0; the
                # original author found playback unaffected -- verify
                # against the players you target.
                b"\x00\x00\x00\x00",
                # Block align: 36 bytes per channel.
                (channels * 36).to_bytes(2, "little"),
                # 4 bits per sample, 2 extra fmt bytes (written as 0), then
                # a 4-byte "fact" chunk carrying the sample count.
                b"\x04\x00\x02\x00\x00\x00fact\x04\x00\x00\x00",
                info["frames"].to_bytes(4, "little"),
                b"data",
                data_size.to_bytes(4, "little"),
            ))

        @staticmethod
        def wav_header(info):
            """Return the 44-byte WAV header for a 16-bit PCM sample.

            Args:
                info: Entry mapping; reads ``"data_size"``, ``"channels"``
                    and ``"frequency"``.

            Returns:
                The header bytes to be written in front of the payload.
            """
            data_size = info["data_size"]
            channels = info["channels"]
            freq = info["frequency"]
            block_align = channels * 2  # bytes per 16-bit frame
            return b"".join((
                b"RIFF",
                # The RIFF size counts everything after this field: the
                # remaining 36 header bytes plus the payload.
                (data_size + 36).to_bytes(4, "little"),
                b"WAVEfmt ",
                # fmt chunk length 16, format tag 1 (PCM).
                b"\x10\x00\x00\x00\x01\x00",
                channels.to_bytes(2, "little"),
                freq.to_bytes(4, "little"),
                # Byte rate = sample rate * block align.  Derived from the
                # format itself so an entry with zero frames cannot cause a
                # division by zero.
                (freq * block_align).to_bytes(4, "little"),
                block_align.to_bytes(2, "little"),
                b"\x10\x00data",
                data_size.to_bytes(4, "little"),
            ))

        # Maps an entry's "format" to the function building its WAV header.
        # The plain functions are stored (not the staticmethod wrappers),
        # because staticmethod objects are only callable on Python 3.10+.
        header_formatter = {
            "PCM": wav_header.__func__,
            "ADPCM": adpcm_wav_header.__func__
        }

        def __init__(self, file):
            """Load the archive at path *file* and parse all of its headers."""
            self.data = Path(file).read_bytes()
            self.parse_header()
            self.parse_entries()

        def parse_header(self):
            """Decode the 48-byte file header into ``self.headers``.

            Also (re)starts ``self.chunks`` with the raw header bytes.
            """
            headers = self.data[:48]
            self.chunks = [headers]
            self.headers = dict(
                zip(
                    self.HEADERS,
                    struct.unpack("<4s5I8s16s", headers)
                )
            )

        def parse_entry(self, data):
            """Decode the 78 bytes of an entry header that follow its size.

            Args:
                data: Exactly 78 bytes: the NUL-padded 30-byte name
                    followed by the fixed numeric fields.

            Returns:
                ``(name, entry)`` where *entry* maps ``ENTRY_HEADERS`` to
                the decoded values plus a ``"format"`` key of ``"MPEG"``,
                ``"PCM"`` or ``"ADPCM"``.

            Raises:
                struct.error: If *data* is not exactly 78 bytes long.
            """
            chunks = struct.unpack("<30s6I4H2f2I", data)
            flags = chunks[5]  # the "mode" field
            entry = dict(zip(
                self.ENTRY_HEADERS,
                chunks[1:]
            ))
            # Bit 512 of the mode field selects MPEG, bit 16 selects PCM;
            # everything else is treated as ADPCM.
            entry["format"] = (
                "MPEG" if flags & 512 else (
                    "PCM" if flags & 16 else "ADPCM"
                )
            )
            return (
                chunks[0].strip(b"\x00").decode(),
                entry,
            )

        def parse_entries(self):
            """Walk the entry-header array and fill ``entries``/``offsets``.

            Each entry starts with its own little-endian 16-bit size, which
            is used to step to the next entry.  Bytes beyond the fixed 80
            are stored under the entry's ``"extra"`` key.  Sample payloads
            are laid out back to back starting at ``header_size + 48``.
            """
            count = self.headers["file_count"]
            offset = self.headers["header_size"] + 48
            self.entries = {}
            self.offsets = {}
            pos = 48
            for _ in range(count):
                length = int.from_bytes(self.data[pos:pos + 2], "little")
                chunk = self.data[pos:pos + length]
                self.chunks.append(chunk)
                name, entry = self.parse_entry(chunk[2:80])
                if length > 80:
                    entry["extra"] = chunk[80:]

                self.entries[name] = entry
                size = entry["data_size"]
                self.offsets[name] = (offset, offset + size)
                pos += length
                offset += size

            # ``pos`` already points just past the last entry, so the 16
            # bytes that follow the header array start right here.
            self.chunks.append(self.data[pos:pos + 16])

        def extract(self, file, folder):
            """Write sample *file* into *folder*.

            MPEG samples become ``<stem>.mp3`` with the payload copied
            verbatim; other formats become ``<stem>.wav`` with the matching
            header from ``header_formatter`` in front of the payload.

            Raises:
                KeyError: If *file* is not a sample name in this archive.
            """
            entry = self.entries[file]
            audio_format = entry["format"]
            ext = ".mp3" if audio_format == "MPEG" else ".wav"
            filename = file.rsplit(".", 1)[0] + ext
            with open(os.path.join(folder, filename), "wb") as f:
                if formatter := self.header_formatter.get(audio_format):
                    f.write(formatter(entry))

                start, end = self.offsets[file]
                f.write(self.data[start:end])

        def extract_all(self, folder):
            """Extract every sample in the archive into *folder*."""
            for file in self.entries:
                self.extract(file, folder)