For debug reasons, I need to save the content of a big MLMultiarray as a file.
I need an accuracy of up to 0.0001. Saving the entire matrix as a JSON file is impossible as it is too big (1.5G). I Assume that a binary representation or a compressed representation will be x5 or x20 smaller. As this is a debug code it can be somewhat slow.
Any idea?
Instead of encoding the huge MLMultiArray which cause a memory overloading, you can iterate over it and write it as stream to a file in JSON format:
First you should add FileStreamer.swift which responsible for writing:
import Foundation
class FileStreamer {
private let fileURL: URL
let fileHandle: FileHandle
init(fileURL: URL) {
self.fileURL = fileURL
if FileManager.default.fileExists(atPath: fileURL.path) {
try? FileManager.default.removeItem(at: fileURL)
}
FileManager.default.createFile(atPath: fileURL.path, contents: nil, attributes: nil)
self.fileHandle = FileHandle(forWritingAtPath: fileURL.path)!
}
func write(_ string: String) {
fileHandle.seekToEndOfFile()
if let data = string.data(using: .utf8) {
fileHandle.write(data)
}
}
func close() {
try? fileHandle.close()
}
}
Then you can add MLMultiArray helper which embeds the MLMultiArray and has the ability to write it as a JSON format.
//
// MultiArray.swift
//
// Created by Tal Sahar on 16/05/2022.
//
import Foundation
import CoreML
import Swift
public protocol MultiArrayType: Comparable {
static var multiArrayDataType: MLMultiArrayDataType { get }
static func +(lhs: Self, rhs: Self) -> Self
static func -(lhs: Self, rhs: Self) -> Self
static func *(lhs: Self, rhs: Self) -> Self
static func /(lhs: Self, rhs: Self) -> Self
init(_: Int)
var toUInt8: UInt8 { get }
}
extension Double: MultiArrayType {
public static var multiArrayDataType: MLMultiArrayDataType { return .double }
public var toUInt8: UInt8 { return UInt8(self) }
}
extension Float: MultiArrayType {
public static var multiArrayDataType: MLMultiArrayDataType { return .float32 }
public var toUInt8: UInt8 { return UInt8(self) }
}
extension Int32: MultiArrayType {
public static var multiArrayDataType: MLMultiArrayDataType { return .int32 }
public var toUInt8: UInt8 { return UInt8(self) }
}
/**
Wrapper around MLMultiArray to make it more Swifty.
*/
public struct MultiArray<T: MultiArrayType> {
public let array: MLMultiArray
public let pointer: UnsafeMutablePointer<T>
private(set) public var strides: [Int]
private(set) public var shape: [Int]
/**
Creates a new multi-array filled with all zeros.
*/
public init(shape: [Int]) {
let m = try! MLMultiArray(shape: shape as [NSNumber], dataType: T.multiArrayDataType)
self.init(m)
memset(pointer, 0, MemoryLayout<T>.stride * count)
}
/**
Creates a new multi-array initialized with the specified value.
*/
public init(shape: [Int], initial: T) {
self.init(shape: shape)
for i in 0..<count {
pointer[i] = initial
}
}
/**
Creates a multi-array that wraps an existing MLMultiArray.
*/
public init(_ array: MLMultiArray) {
self.init(array, array.shape as! [Int], array.strides as! [Int])
}
init(_ array: MLMultiArray, _ shape: [Int], _ strides: [Int]) {
self.array = array
self.shape = shape
self.strides = strides
pointer = UnsafeMutablePointer<T>(OpaquePointer(array.dataPointer))
}
/**
Returns the number of elements in the entire array.
*/
public var count: Int {
return shape.reduce(1, *)
}
public subscript(a: Int) -> T {
get { return pointer[a] }
set { pointer[a] = newValue }
}
public subscript(a: Int, b: Int) -> T {
get { return pointer[a*strides[0] + b*strides[1]] }
set { pointer[a*strides[0] + b*strides[1]] = newValue }
}
public subscript(a: Int, b: Int, c: Int) -> T {
get { return pointer[a*strides[0] + b*strides[1] + c*strides[2]] }
set { pointer[a*strides[0] + b*strides[1] + c*strides[2]] = newValue }
}
public subscript(a: Int, b: Int, c: Int, d: Int) -> T {
get { return pointer[a*strides[0] + b*strides[1] + c*strides[2] + d*strides[3]] }
set { pointer[a*strides[0] + b*strides[1] + c*strides[2] + d*strides[3]] = newValue }
}
public subscript(a: Int, b: Int, c: Int, d: Int, e: Int) -> T {
get { return pointer[a*strides[0] + b*strides[1] + c*strides[2] + d*strides[3] + e*strides[4]] }
set { pointer[a*strides[0] + b*strides[1] + c*strides[2] + d*strides[3] + e*strides[4]] = newValue }
}
public subscript(indices: [Int]) -> T {
get { return pointer[offset(for: indices)] }
set { pointer[offset(for: indices)] = newValue }
}
func offset(for indices: [Int]) -> Int {
var offset = 0
for i in 0..<indices.count {
offset += indices[i] * strides[i]
}
return offset
}
}
extension MultiArray {
func write(to stream: FileStreamer) {
write(to: stream, [])
}
private func write(to stream: FileStreamer, _ indices: [Int]) {
// This function is called recursively for every dimension.
// Add an entry for this dimension to the end of the array.
var indices = indices + [0]
let d = indices.count - 1 // the current dimension
let N = shape[d] // how many elements in this dimension
stream.write("[")
if indices.count < shape.count { // not last dimension yet?
for i in 0..<N {
indices[d] = i
write(to: stream, indices)
if i != N - 1 {
stream.write(",")
}
}
} else { // the last dimension has actual data
for i in 0..<N {
indices[d] = i
stream.write("\(self[indices])")
if i != N - 1 { // not last element?
stream.write(",")
}
}
}
stream.write("]")
}
}
extension MultiArray: CustomStringConvertible {
public var description: String {
return description([])
}
func description(_ indices: [Int]) -> String {
func indent(_ x: Int) -> String {
return String(repeating: " ", count: x)
}
// This function is called recursively for every dimension.
// Add an entry for this dimension to the end of the array.
var indices = indices + [0]
let d = indices.count - 1 // the current dimension
let N = shape[d] // how many elements in this dimension
var s = "["
if indices.count < shape.count { // not last dimension yet?
for i in 0..<N {
indices[d] = i
s += description(indices) // then call recursively again
if i != N - 1 {
s += ",\n" + indent(d + 1)
}
}
} else { // the last dimension has actual data
s += " "
for i in 0..<N {
indices[d] = i
s += "\(self[indices])"
if i != N - 1 { // not last element?
s += ", "
if i % 11 == 10 { // wrap long lines
s += "\n " + indent(d + 1)
}
}
}
s += " "
}
return s + "]"
}
}
Now you can write any size of MLMultiArray to a file while keeping your memory usage constant.
Usage:
let descriptorsFileStream = FileStreamer(fileURL: <#PATH#>)
descriptors.writeCompressed(to: descriptorsFileStream)
descriptorsFileStream.close()