Search code examples
performance, rust, binaryfiles

Rust Deduped Cow HashSet


I want to store a large number of byte arrays. My plan is to keep them in an AHashSet<Vec<u8>>, borrow from it using Cow, and only write when needed; this should also cause the byte arrays to be deduplicated. However, my attempts so far have been futile:

#![feature(hash_set_entry)]

use std::borrow::{Borrow, Cow, ToOwned};
use std::hash::Hash;

use ahash::AHashSet;

#[derive(Debug)]
struct CipherText<'a> { // 'a is the lifetime of any data borrowed through the Cow fields
    ciphertext: Cow<'a, Vec<u8>>, // byte vector, either borrowed for 'a or owned
    ciphers: Vec<Cipher<'a>>, // Cipher entries sharing the same borrow lifetime
}

#[derive(Debug)]
struct Cipher<'a> { // 'a is the lifetime of any data borrowed through the Cow fields
    cipher_id: Cow<'a, Vec<u8>>, // byte vector, either borrowed for 'a or owned
    keys: Vec<Cow<'a, Vec<u8>>>, // zero or more byte vectors, each borrowed or owned
}

fn main() { // NOTE: intentionally does not compile; it reproduces error E0502
    let mut string_table: AHashSet<Vec<u8>> = vec![ // set that owns each distinct byte vector
        "Hello World!".as_bytes().to_vec(),
        "atbash".as_bytes().to_vec(),
        "caesar_decrypt".as_bytes().to_vec(),
        "5".as_bytes().to_vec(),
    ]
    .into_iter()
    .collect();

    let mut ciphertexts: Vec<CipherText> = vec![ // every Cow::Borrowed below borrows from string_table
        CipherText {
            ciphertext: Cow::Borrowed(
                string_table // immutable borrow starts here and is stored inside ciphertexts
                    .get(&"Hello World!".as_bytes().to_vec())
                    .unwrap(),
            ),
            ciphers: vec![Cipher {
                cipher_id: Cow::Borrowed(string_table.get(&"atbash".as_bytes().to_vec()).unwrap()),
                keys: vec![],
            }],
        },
        CipherText {
            ciphertext: Cow::Borrowed(
                string_table
                    .get(&"Hello World!".as_bytes().to_vec())
                    .unwrap(),
            ),
            ciphers: vec![Cipher {
                cipher_id: Cow::Borrowed(
                    string_table
                        .get(&"caesar_decrypt".as_bytes().to_vec())
                        .unwrap(),
                ),
                keys: vec![Cow::Borrowed(
                    string_table.get(&"5".as_bytes().to_vec()).unwrap(),
                )],
            }],
        },
    ];

    string_table.insert("TEST".as_bytes().to_vec()); // E0502: mutable borrow while ciphertexts still borrows string_table
    string_table.insert("TEST2".as_bytes().to_vec()); // E0502: same conflict as the line above

    ciphertexts[0].ciphertext = Cow::Borrowed( // this use keeps the earlier immutable borrows alive
        &string_table.get_or_insert_owned(&"Goodbye Cruel World...".as_bytes().to_vec()), // also reported as an error: inserting mutates string_table
    );
}

Both of the TEST lines, as well as the ciphertexts[0] line, fail with the following error:

error[E0502]: cannot borrow `string_table` as mutable because it is also borrowed as immutable
  --> src/main.rs:61:5
   |
33 |                 string_table
   |                 ------------ immutable borrow occurs here
...
61 |     string_table.insert("TEST".as_bytes().to_vec());
   |     ^^^^^^^^^^^^ mutable borrow occurs here
...
64 |     ciphertexts[0].ciphertext = Cow::Borrowed(
   |     ----------- immutable borrow later used here

My aim is for all the byte arrays to be held only as references; if I change one, the changed value should be cloned, added to the string_table, and referenced from there instead. This data will be stored in a custom binary format, and this is the start of the process of writing the serialiser and deserialiser. Hope this all makes sense!


Solution

  • You cannot mutate string_table while a CipherText holds a reference to it. This is simply one of Rust's core invariants. So you cannot use a Cow that borrows from the table here. The typical ways around it are generally either:

    1. Use indexes/keys to reference the other structure, although unfortunately I can't see a good way to do that in this scenario.

    2. Use Rcs so that the ciphertexts and the table share ownership of the bytes. This does mean that the ciphertexts can stand on their own, but you can always use the table as a broker to dedup elements before using them with judicious use of get_or_insert.

      #![feature(hash_set_entry)]
      
      use std::rc::Rc;
      use ahash::AHashSet;
      
      /// A ciphertext plus the list of `Cipher` entries stored with it.
      ///
      /// The bytes are held through `Rc`, so this value shares ownership of them
      /// instead of borrowing, and the struct needs no lifetime parameter.
      #[derive(Debug)]
      struct CipherText {
          /// Reference-counted handle to the ciphertext bytes.
          ciphertext: Rc<Vec<u8>>,
          /// The `Cipher` entries stored with this ciphertext.
          ciphers: Vec<Cipher>,
      }
      
      /// A cipher identifier together with zero or more keys.
      ///
      /// Every byte string is held through `Rc`, so it can be shared with other
      /// owners of the same allocation.
      #[derive(Debug)]
      struct Cipher {
          /// Reference-counted handle to the identifier bytes.
          cipher_id: Rc<Vec<u8>>,
          /// Reference-counted handles to the key bytes; may be empty.
          keys: Vec<Rc<Vec<u8>>>,
      }
      
      /// Builds a table of shared byte strings, creates two `CipherText` values
      /// whose fields share ownership of table entries, then adds further entries
      /// to the table and re-points the first ciphertext at a new one.
      fn main() {
          // Looks `bytes` up in `table`, inserting it if it is absent, and returns
          // a new `Rc` handle to the entry the table holds.
          fn intern(table: &mut AHashSet<Rc<Vec<u8>>>, bytes: &[u8]) -> Rc<Vec<u8>> {
              Rc::clone(table.get_or_insert(Rc::new(bytes.to_vec())))
          }

          // Initial contents of the table.
          let seeds: [&[u8]; 4] = [b"Hello World!", b"atbash", b"caesar_decrypt", b"5"];
          let mut string_table: AHashSet<Rc<Vec<u8>>> = seeds
              .iter()
              .map(|bytes| Rc::new(bytes.to_vec()))
              .collect();

          // Each field takes its own `Rc` handle obtained through the table, so
          // no borrow of `string_table` outlives the call to `intern`.
          let atbash_text = CipherText {
              ciphertext: intern(&mut string_table, b"Hello World!"),
              ciphers: vec![Cipher {
                  cipher_id: intern(&mut string_table, b"atbash"),
                  keys: Vec::new(),
              }],
          };
          let caesar_text = CipherText {
              ciphertext: intern(&mut string_table, b"Hello World!"),
              ciphers: vec![Cipher {
                  cipher_id: intern(&mut string_table, b"caesar_decrypt"),
                  keys: vec![intern(&mut string_table, b"5")],
              }],
          };
          let mut ciphertexts: Vec<CipherText> = vec![atbash_text, caesar_text];

          // The table can be mutated freely: the ciphertexts hold `Rc` handles,
          // not references into it.
          string_table.insert(Rc::new(b"TEST".to_vec()));
          string_table.insert(Rc::new(b"TEST2".to_vec()));

          // "Changing" a value means interning the new bytes and swapping the handle.
          ciphertexts[0].ciphertext = intern(&mut string_table, b"Goodbye Cruel World...");
      }
      

      This will have the same effect as Cow: the elements owned by the Rcs are immutable, so a new element must be made in order to make changes.