Search code examples
json swift protocol-buffers flatbuffers

Swift: Most performant way of parsing 10k lines of data?


So my app currently reads 10k lines into a variable and then uses SwiftyJSON to parse them into Realm.

Source: https://github.com/skishore/makemeahanzi/blob/master/graphics.txt https://github.com/skishore/makemeahanzi/blob/master/dictionary.txt

Problem: It takes way too long: 2:28 minutes. It also uses 400 MB of memory!

Question: How can I make this faster? Do any of you have experience with FlatBuffers or Protobuf?

Help would be very welcome!

Cheers, Dom


This is the code:

/// Parses the bundled graphics and dictionary files and writes the resulting
/// characters to Realm in batches.
///
/// Line `i` of the graphics file is paired with line `i` of the dictionary file,
/// so both files must have the same number of lines (and more than one line).
/// If they do not, a message is printed and nothing is written.
func parseToRealm() {

    // each of these files have 9500+ lines of data
    // (basically dictionaries with word definitions)
    let graphicsFileContents = readFile_Graphics()
    let dictFileContents = readFile_Dict()

    // check if counts of two source files match
    guard graphicsFileContents.count == dictFileContents.count, graphicsFileContents.count > 1 else {
        print ("two files have different counts of lines. aborting...")
        return
    }

    // Number of characters collected before they are written to the DB.
    let batchSize = 150
    // Start with a truly empty batch: `[Characters()]` would add a blank
    // object to every write.
    var characterArr = [Characters]()
    // Index of the first line in the pending batch (used for logging only).
    var batchStart = 0

    // Writes the pending batch (if any) and resets it to save memory.
    func flushBatch(through lastIndex: Int) {
        guard !characterArr.isEmpty else { return }
        realmActions.writeCharsToRealm(characterArr: characterArr)
        print("Writing \(batchStart)-\(lastIndex)")
        characterArr.removeAll(keepingCapacity: true)
        batchStart = lastIndex + 1
    }

    // loop through two files to get all chars
    for (i, jsonString) in graphicsFileContents.enumerated() {
        // parse data from string into json
        // (`Data(string.utf8)` cannot fail, so no force-unwrap is needed)
        let dataFromString = Data(jsonString.utf8)
        let singleCharJson = try? JSON(data: dataFromString)


        // parse stuff from file1
        // ... deleted lines for legal reasons


        // DICT information
        let dictDataFromString = Data(dictFileContents[i].utf8)
        let singleDictJson = try? JSON(data: dictDataFromString)

        // parse stuff from that dictionary
        // ... deleted lines for legal reasons

        characterArr.append(Character)

        // Every `batchSize` characters, write them into DB
        if characterArr.count >= batchSize {
            flushBatch(through: i)
        }
    } // end loop file contents

    // Write whatever is left over. The original in-loop check
    // `i == graphicsFileContents.count` could never be true, so the last
    // partial batch was silently dropped.
    flushBatch(through: graphicsFileContents.count - 1)
}


// read graphics file and return all contents as array of strings
        // return Array of Strings
        /// Reads the bundled `graphics.txt` and returns its contents split on newlines.
        ///
        /// Returns an empty array when the resource is not in the main bundle or
        /// cannot be read as UTF-8 (the read error is printed).
        func readFile_Graphics () -> [String] {
            // Without the bundled resource there is nothing to read.
            guard let resourcePath = Bundle.main.path(forResource: "graphics", ofType: "txt") else {
                return [String]()
            }
            do {
                let contents = try String(contentsOfFile: resourcePath, encoding: .utf8)
                return contents.components(separatedBy: .newlines)
            } catch {
                print("cannot get file graphics.txt. Error message:")
                print(error)
                return [String]()
            }
        }



// read dictionary file and return all contents as array of strings
    /// Reads the bundled `dictionary.txt` and returns its contents split on newlines.
    ///
    /// Returns an empty array when the resource is not in the main bundle or
    /// cannot be read as UTF-8 (the read error is printed). This matches
    /// `readFile_Graphics`; previously a failure produced `[""]`, i.e. a
    /// single empty "line".
    func readFile_Dict () -> [String]{
        guard let path = Bundle.main.path(forResource: "dictionary", ofType: "txt") else {
            return []
        }
        do {
            let data = try String(contentsOfFile: path, encoding: .utf8)
            return data.components(separatedBy: .newlines)
        } catch {
            print("cannot get file dictionary.txt. Error message:")
            print(error)
            return []
        }
    }

Solution

  • DispatchQueue.global(qos: .background).async {
                // Locate the bundled graphics file; nothing to do without it.
                guard let path = Bundle.main.path(forResource: "graphics", ofType: "txt") else {
                    print("Dang! File wasn't found!")
                    return
                }
                let cal = Calendar.current
                let d1 = Date() // start of the timed section

                guard let streamReader = StreamReader(path: path) else {
                    print("Dang! StreamReader couldn't be created!")
                    return
                }
                var counter = 0
                // nextLine() returns nil on EOF, which ends the loop without
                // logging a spurious error when the file ends with a newline.
                while let nextLine = streamReader.nextLine() {
                    // The parsed value is not used here; this only runs the parser.
                    _ = JSON(parseJSON: nextLine)

                    counter += 1
                    print("\(counter): \(nextLine)")
                }
                let d2 = Date() // end of the timed section
                // Measure from start to end so the difference is not negative.
                let components = cal.dateComponents([.minute], from: d1, to: d2)
                print("Diff: \(components.minute ?? 0)")
            }
        }
    

    StreamReader class

    import Foundation
    
    /// Reads a file chunk by chunk and hands out its contents one
    /// delimiter-separated line at a time.
    class StreamReader  {

        /// Encoding used for the delimiter and for decoding each line.
        let encoding : String.Encoding
        /// Number of bytes requested from the file per read.
        let chunkSize : Int
        /// Handle of the file being read; `nil` once `close()` has been called.
        var fileHandle : FileHandle!
        /// The line delimiter encoded with `encoding`.
        let delimData : Data
        /// Bytes read from the file that have not been returned as a line yet.
        var buffer : Data
        /// Set to `true` once a read from the file returned no data.
        var atEof : Bool

        /// Creates a reader for the file at `path`.
        ///
        /// Fails (returns `nil`) when the file cannot be opened for reading or
        /// when `delimiter` cannot be represented in `encoding`.
        init?(path: String, delimiter: String = "\n", encoding: String.Encoding = .utf8,
              chunkSize: Int = 4096) {

            guard let handle = FileHandle(forReadingAtPath: path) else { return nil }
            guard let separator = delimiter.data(using: encoding) else { return nil }

            self.encoding = encoding
            self.chunkSize = chunkSize
            self.fileHandle = handle
            self.delimData = separator
            self.buffer = Data(capacity: chunkSize)
            self.atEof = false
        }

        deinit {
            close()
        }

        /// Return next line, or nil on EOF.
        func nextLine() -> String? {
            precondition(fileHandle != nil, "Attempt to read from closed file")

            while !atEof {
                // A complete line is already buffered: cut it off (together with
                // its delimiter) and return it without the delimiter.
                if let delimRange = buffer.range(of: delimData) {
                    let lineBytes = buffer.subdata(in: 0..<delimRange.lowerBound)
                    buffer.removeSubrange(0..<delimRange.upperBound)
                    return String(data: lineBytes, encoding: encoding)
                }

                // No delimiter yet: pull the next chunk and search again.
                let chunk = fileHandle.readData(ofLength: chunkSize)
                if !chunk.isEmpty {
                    buffer.append(chunk)
                    continue
                }

                // EOF or read error.
                atEof = true
                if buffer.isEmpty { break }

                // Whatever is left is the last line (not terminated by a delimiter).
                let lastLine = String(data: buffer, encoding: encoding)
                buffer.count = 0
                return lastLine
            }
            return nil
        }

        /// Start reading from the beginning of file.
        func rewind() -> Void {
            fileHandle.seek(toFileOffset: 0)
            buffer.count = 0
            atEof = false
        }

        /// Close the underlying file. No reading must be done after calling this method.
        func close() -> Void {
            if let handle = fileHandle {
                handle.closeFile()
            }
            fileHandle = nil
        }
    }
    
    // MARK: - Sequence

    extension StreamReader : Sequence {
        /// Returns an iterator that yields the result of `nextLine()` on each
        /// step, so the reader can be used in a `for line in reader` loop.
        func makeIterator() -> AnyIterator<String> {
            let pullNextLine: () -> String? = { self.nextLine() }
            return AnyIterator(pullNextLine)
        }
    }
    

    The StreamReader class reads the text file line by line, so there is no need to load the whole file into memory at once. The first block shows how to use it to read the contents of the file. Try the above code; it should solve your problem. Please note that I've used a background thread, whereas Realm doesn't work on a background thread (AFAIK). Let me know if that helps.