Search code examples
Tags: python, huggingface-transformers, coreml

How can I export a tokenizer from Huggingface transformers to CoreML?


I load a tokenizer and a Bert model from Huggingface transformers, and export the Bert model to CoreML:

from transformers import AutoTokenizer, AutoModelForTokenClassification
import torch

# Checkpoint name shared by the tokenizer and the model.
MODEL_NAME = "huawei-noah/TinyBERT_General_4L_312D"

# Tokenizer first, then the token-classification model, both from the same checkpoint.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForTokenClassification.from_pretrained(MODEL_NAME)

# Encode one sample sentence; return_tensors="pt" asks for PyTorch tensors.
text = "Hugging Face is creating a tool that democratizes AI."
inputs = tokenizer(text, return_tensors="pt")

Requirements:

pip install transformers torch

How can I export a tokenizer from Huggingface transformers to CoreML?


Solution

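  • This is the BERT tokenizer I use, written in Swift, and it works well. Much of it comes from Zach Nagengast and Julien Chaumond. Hope it helps! All you need is a vocab.txt file containing the tokenizer's vocabulary, added to your app bundle (the code loads it from the main bundle). As an example, the bert-base-cased vocabulary can be found here - https://huggingface.co/google-bert/bert-base-cased/blob/main/vocab.txt - but you should use the vocab.txt that belongs to your own model. Note that this tokenizer lowercases its input, so it fits uncased vocabularies best.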

    // Credit to Julien from Huggingface
    
    import Foundation
    
    /// Error type declared for the tokenizer.
    ///
    /// NOTE(review): none of the tokenizer code visible alongside this enum throws it;
    /// confirm whether any caller still relies on it.
    enum TokenizerError: Error {
        /// Carries a `String` payload; presumably a description of the input that was
        /// too long — TODO confirm the intended meaning with the code that throws it.
        case tooLong(String)
    }
    
    /// BERT-style tokenizer: splits text with `BasicTokenizer`, breaks every word into
    /// word pieces with `WordpieceTokenizer`, and maps the pieces to vocabulary ids.
    ///
    /// The vocabulary is plain text with one token per line; a token's id is its
    /// zero-based position among the non-empty lines.
    class BertTokenizer {
        private let basicTokenizer = BasicTokenizer()
        private let wordpieceTokenizer: WordpieceTokenizer
        /// Hard cap on the number of tokens (special tokens included) that are converted to ids.
        private let maxLen = 512

        private let vocab: [String: Int]
        private let idsToTokens: [Int: String]

        /// Creates a tokenizer from the `vocab.txt` resource of the main bundle.
        ///
        /// Traps with a descriptive message when the resource is missing or cannot be
        /// read as UTF-8, because the tokenizer cannot work without its vocabulary.
        init() {
            guard let url = Bundle.main.url(forResource: "vocab", withExtension: "txt") else {
                fatalError("vocab.txt was not found in the main bundle")
            }
            let vocabText: String
            do {
                vocabText = try String(contentsOf: url, encoding: .utf8)
            } catch {
                fatalError("vocab.txt could not be read as UTF-8: \(error)")
            }
            let tables = BertTokenizer.makeLookupTables(from: vocabText)
            vocab = tables.vocab
            idsToTokens = tables.idsToTokens
            wordpieceTokenizer = WordpieceTokenizer(vocab: tables.vocab)
        }

        /// Creates a tokenizer from the contents of a vocabulary file.
        ///
        /// - Parameter vocabText: One token per line; LF, CRLF and CR line endings are accepted.
        init(vocabText: String) {
            let tables = BertTokenizer.makeLookupTables(from: vocabText)
            vocab = tables.vocab
            idsToTokens = tables.idsToTokens
            wordpieceTokenizer = WordpieceTokenizer(vocab: tables.vocab)
        }

        /// Builds the token-to-id and id-to-token tables from vocabulary text.
        private static func makeLookupTables(
            from vocabText: String
        ) -> (vocab: [String: Int], idsToTokens: [Int: String]) {
            // "\r\n" is a single Character in Swift, so splitting on "\n" alone would leave
            // a CRLF file as one giant token. Empty lines are skipped, as before.
            let tokens = vocabText
                .split(whereSeparator: { $0 == "\n" || $0 == "\r\n" || $0 == "\r" })
                .map(String.init)
            var vocab: [String: Int] = [:]
            var idsToTokens: [Int: String] = [:]
            vocab.reserveCapacity(tokens.count)
            idsToTokens.reserveCapacity(tokens.count)
            for (id, token) in tokens.enumerated() {
                vocab[token] = id
                idsToTokens[id] = token
            }
            return (vocab, idsToTokens)
        }

        /// Splits `text` into word pieces (no special tokens are added).
        func tokenize(text: String) -> [String] {
            return basicTokenizer.tokenize(text: text).flatMap { wordpieceTokenizer.tokenize(word: $0) }
        }

        /// Looks up the id of `token`, falling back to the id of `[UNK]`.
        ///
        /// Traps only when the token is unknown and the vocabulary has no `[UNK]` entry.
        private func id(for token: String) -> Int {
            if let id = vocab[token] {
                return id
            }
            guard let unknownId = vocab["[UNK]"] else {
                preconditionFailure("Token \"\(token)\" is not in the vocabulary and the vocabulary has no [UNK] entry")
            }
            return unknownId
        }

        /// Maps at most `maxLen` tokens to their vocabulary ids.
        private func convertTokensToIds(tokens: [String]) -> [Int] {
            return tokens.prefix(maxLen).map { id(for: $0) }
        }

        /// Main entry point
        ///
        /// Wraps the word pieces of `text` in `[CLS]` … `[SEP]`, converts them to ids and
        /// pads the result so both returned arrays have exactly `maxLength` elements.
        ///
        /// - Parameters:
        ///   - text: The text to encode.
        ///   - maxLength: Length of the returned arrays; must not be negative.
        /// - Returns: The token ids, and a mask holding 1 for real tokens and 0 for padding.
        func tokenizeToIds(text: String, maxLength: Int = 512) -> (tokenIds: [Int], attentionMask: [Int]) {
            precondition(maxLength >= 0, "maxLength must not be negative")

            // Reserve two slots for [CLS] and [SEP] so that truncating a long input
            // never cuts off the closing [SEP].
            let pieceBudget = max(0, min(maxLength, maxLen) - 2)
            let pieces = Array(tokenize(text: text).prefix(pieceBudget))
            let tokens = ["[CLS]"] + pieces + ["[SEP]"]

            // For maxLength < 2 the special tokens themselves do not fit; clip to size.
            let realIds = Array(convertTokensToIds(tokens: tokens).prefix(maxLength))
            let paddingCount = maxLength - realIds.count
            let paddingId = vocab["[PAD]"] ?? 0

            let tokenIds = realIds + Array(repeating: paddingId, count: paddingCount)
            // The mask follows the real length instead of testing `id != 0`, so it stays
            // correct when a real token has id 0 or the padding id is not 0.
            let attentionMask = Array(repeating: 1, count: realIds.count)
                + Array(repeating: 0, count: paddingCount)
            return (tokenIds, attentionMask)
        }

        /// Returns the vocabulary id of `token`, or the id of `[UNK]` for unknown tokens.
        func tokenToId(token: String) -> Int {
            return id(for: token)
        }

        /// Un-tokenization: get tokens from tokenIds
        ///
        /// Ids that are not part of the vocabulary are rendered as `[UNK]`.
        func unTokenize(tokens: [Int]) -> [String] {
            return tokens.map { idsToTokens[$0] ?? "[UNK]" }
        }

        /// Un-tokenization: merges word pieces back into space-separated words.
        ///
        /// A piece starting with `##` is glued (without the prefix) onto the word being
        /// built; any other piece starts a new word.
        func convertWordpieceToBasicTokenList(_ wordpieceTokenList: [String]) -> String {
            var tokenList: [String] = []
            var individualToken = ""

            for token in wordpieceTokenList {
                if token.hasPrefix("##") {
                    individualToken.append(contentsOf: token.dropFirst(2))
                } else {
                    if !individualToken.isEmpty {
                        tokenList.append(individualToken)
                    }
                    individualToken = token
                }
            }

            tokenList.append(individualToken)

            return tokenList.joined(separator: " ")
        }
    }
    
    
    
    /// Pre-tokenizer that strips diacritics, splits on whitespace, lowercases, and
    /// separates every non-alphanumeric character into a token of its own.
    class BasicTokenizer {
        /// Special tokens that are passed through untouched (not lowercased, not split).
        let neverSplit = [
            "[UNK]", "[SEP]", "[PAD]", "[CLS]", "[MASK]"
        ]

        /// Splits `text` into lowercase words and single-character punctuation tokens.
        ///
        /// - Parameter text: Arbitrary input text.
        /// - Returns: The tokens in input order; whitespace (including line breaks) never
        ///   appears in the output.
        func tokenize(text: String) -> [String] {
            // Newlines must count as separators too: with `.whitespaces` alone a line
            // break survived as a standalone "\n" token.
            let chunks = text.folding(options: .diacriticInsensitive, locale: nil)
                .components(separatedBy: .whitespacesAndNewlines)

            return chunks.flatMap { (chunk: String) -> [String] in
                if neverSplit.contains(chunk) {
                    return [chunk]
                }
                var pieces: [String] = []
                var run = ""
                for character in chunk.lowercased() {
                    if character.isLetter || character.isNumber || character == "°" {
                        run.append(character)
                    } else {
                        // Any other character ends the current run and is a token itself.
                        if !run.isEmpty {
                            pieces.append(run)
                            run = ""
                        }
                        pieces.append(String(character))
                    }
                }
                if !run.isEmpty {
                    pieces.append(run)
                }
                return pieces
            }
        }
    }
    
    
    /// Greedy longest-match-first WordPiece splitter over a fixed vocabulary.
    class WordpieceTokenizer {
        private let unkToken = "[UNK]"
        private let maxInputCharsPerWord = 100
        private let vocab: [String: Int]

        /// - Parameter vocab: Maps each known piece to its id; only key membership is used here.
        init(vocab: [String: Int]) {
            self.vocab = vocab
        }

        /// Returns the characters of `s` at offsets `r`, or `nil` when `r` does not lie
        /// entirely inside `s` (including a negative lower bound).
        func substr(_ s: String, _ r: Range<Int>) -> String? {
            guard r.lowerBound >= 0,
                  let startIndex = s.index(s.startIndex, offsetBy: r.lowerBound, limitedBy: s.endIndex),
                  let endIndex = s.index(startIndex, offsetBy: r.count, limitedBy: s.endIndex)
            else {
                return nil
            }
            return String(s[startIndex..<endIndex])
        }

        /// Splits a single word into word pieces.
        ///
        /// Pieces after the first carry a `##` prefix. The whole word becomes `[UNK]`
        /// when it has more than 100 characters or when some part of it cannot be
        /// matched against the vocabulary. An empty word yields an empty array.
        func tokenize(word: String) -> [String] {
            // Index a Character array once instead of re-walking the String on every probe.
            let characters = Array(word)
            if characters.count > maxInputCharsPerWord {
                return [unkToken]
            }

            var pieces: [String] = []
            var start = 0
            while start < characters.count {
                // Try the longest remaining candidate first and shrink from the right.
                var end = characters.count
                var match: String? = nil
                while start < end {
                    let body = String(characters[start..<end])
                    let candidate = start > 0 ? "##\(body)" : body
                    if vocab[candidate] != nil {
                        match = candidate
                        break
                    }
                    end -= 1
                }
                guard let piece = match else {
                    // One unmatched stretch invalidates the whole word.
                    return [unkToken]
                }
                pieces.append(piece)
                start = end
            }
            return pieces
        }
    }