Tags: python, pytorch, transformer-model

How to solve: RuntimeError: CUDA error: device-side assert triggered?


I want to use the paraphrase-multilingual-mpnet-base-v2 model to build embeddings, and I get this error:

RuntimeError: CUDA error: device-side assert triggered

The error occurs when executing string = {k: v.to(device=device) for k, v in string.items()}.

Why do I get the error?

I am working in Google Colab with 12.7 GB of RAM and 16 GB of GPU RAM.

The goal of the code is to generate sentence embeddings; with some customization, chunk-wise processing is also possible.

The complete error message:

RuntimeError                              Traceback (most recent call last)
<ipython-input-17-8e6bf00d9e24> in <cell line: 104>()
    102     return np.nan
    103 
--> 104 processed_data = processDataRAG(df[5000:], tokenizer, model)

4 frames
<ipython-input-17-8e6bf00d9e24> in processDataRAG(data, tokenizer, model)
     10   sents = [str(sentences[0]) for sentences in article_sentences]
     11   number_of_article =[sentences[1] for sentences in article_sentences]
---> 12   embedded_sentencs = [embeddChunkwise(sentence, tokenizer, model, 512) for sentence in tqdm(sents, desc = "Create chunk-wise embeddings")]
     13   return pd.DataFrame({
     14       "sentences": sents,

<ipython-input-17-8e6bf00d9e24> in <listcomp>(.0)
     10   sents = [str(sentences[0]) for sentences in article_sentences]
     11   number_of_article =[sentences[1] for sentences in article_sentences]
---> 12   embedded_sentencs = [embeddChunkwise(sentence, tokenizer, model, 512) for sentence in tqdm(sents, desc = "Create chunk-wise embeddings")]
     13   return pd.DataFrame({
     14       "sentences": sents,

<ipython-input-17-8e6bf00d9e24> in embeddChunkwise(string, tokenizer, model, chunk_size)
     55     #encoded_input = tokenizer(tokenizer.detokenize(tokenized_chunk))
     56     if len(encoded_chunk) > 0:
---> 57       embedded_chunk = createEmbeddings(
     58           tokenizer(tokenizer.decode(encoded_chunk, skip_special_tokens  = True), return_tensors='pt', add_special_tokens=False),
     59           model

<ipython-input-17-8e6bf00d9e24> in createEmbeddings(string, model)
     77   #print("Length of input_ids: ", len(string["input_ids"][0]))
     78   if "input_ids" in string.keys():
---> 79     string = {k: v.to(device=device) for k, v in string.items()}
     80     with torch.no_grad():
     81 

<ipython-input-17-8e6bf00d9e24> in <dictcomp>(.0)
     77   #print("Length of input_ids: ", len(string["input_ids"][0]))
     78   if "input_ids" in string.keys():
---> 79     string = {k: v.to(device=device) for k, v in string.items()}
     80     with torch.no_grad():
     81 

RuntimeError: CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1.
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.
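
The message itself suggests two debugging aids. A minimal sketch of how they could be used in Colab (the environment variable is standard PyTorch behaviour; the CPU round-trip names below are placeholders, not from my notebook):

import os
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"  # set before the first CUDA call so the stack trace points at the real line

# Alternatively, run the failing input on CPU once to get a readable Python-level error:
# cpu_model = model.to("cpu")
# cpu_inputs = {k: v.to("cpu") for k, v in encoded_input.items()}
# cpu_model(**cpu_inputs)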

I run this code:

from transformers import AutoTokenizer, AutoModel
import torch
from torch import cuda
import numpy as np
import pandas as pd
from tqdm import tqdm
tqdm.pandas()  # needed for .progress_apply below
# nlp_de: sentence-splitting pipeline (e.g. a spaCy German model) loaded elsewhere in the notebook

def mean_pooling(model_output, attention_mask):
    token_embeddings = model_output[0] #First element of model_output contains all token embeddings
    input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)

# Select device globally
device = f'cuda:{cuda.current_device()}' if cuda.is_available() else 'cpu'

# Load model from HuggingFace Hub
tokenizer = AutoTokenizer.from_pretrained('sentence-transformers/paraphrase-multilingual-mpnet-base-v2')
model = AutoModel.from_pretrained('sentence-transformers/paraphrase-multilingual-mpnet-base-v2',
                                  device_map = device)
df = pd.read_json(file_path)

def processDataRAG(data, tokenizer, model):
  
  article_sentences = data.content.progress_apply(lambda x: list(nlp_de(x).sents))
  #tokenized_articles = data.content.progress_apply(lambda article: tokenizeChunkwise(article, tokenizer, 512))
  
  article_sentences = [
      (sentences, idx) for idx, article in tqdm(enumerate(list(article_sentences)), desc="Loop over articles with index") 
      for sentences in article
      ]
  sents = [str(sentences[0]) for sentences in article_sentences]
  number_of_article =[sentences[1] for sentences in article_sentences]
  embedded_sentencs = [embeddChunkwise(sentence, tokenizer, model, 512) for sentence in tqdm(sents, desc = "Create chunk-wise embeddings")]
  return pd.DataFrame({
      "sentences": sents,
      "embeddings": embedded_sentencs,
      "article": number_of_article
  })

def embeddChunkwise(string, tokenizer, model, chunk_size):
  decreasing_by_special_tokens = 0 # Because of special tokens at the beginning and end
  encoded_string = tokenizer(string, add_special_tokens=False)
  if len(encoded_string["input_ids"])/chunk_size > 1:
    print("Tokenized_string:", encoded_string)
    print("Total tokens: ", str(len(encoded_string["input_ids"])))
    print("Tokenized string in chunks: ", str(len(encoded_string["input_ids"])/chunk_size), " --- " , str(len(encoded_string["input_ids"])//chunk_size +1))
  embedded_chunks = []
  for idx in list(range(len(encoded_string["input_ids"])//chunk_size +1 )):
    encoded_chunk=None

    if (chunk_size-decreasing_by_special_tokens)*(idx+1) < len(encoded_string["input_ids"]): # a full chunk still fits (long inputs, e.g. ~1000 words)
      start_idx, end_idx = (chunk_size*idx - decreasing_by_special_tokens*idx, chunk_size*(idx+1) - decreasing_by_special_tokens*(idx+1))

      encoded_chunk = encoded_string["input_ids"][start_idx:end_idx]

    else: # remaining tokens only (short inputs, e.g. ~20 words, or the final partial chunk)
      if chunk_size-decreasing_by_special_tokens > len(encoded_string["input_ids"]):
        encoded_chunk = encoded_string["input_ids"][chunk_size*(idx) - decreasing_by_special_tokens*(idx):]
      else:
        
        encoded_chunk = encoded_string["input_ids"][-(chunk_size*(idx) - decreasing_by_special_tokens*(idx)):]

    if len(encoded_chunk) > 0:
      embedded_chunk = createEmbeddings(
          tokenizer(tokenizer.decode(encoded_chunk, skip_special_tokens  = True), return_tensors='pt', add_special_tokens=False), 
          model
          )
      if isinstance(embedded_chunk, list):
        embedded_chunks.append(embedded_chunk[0])
  if len(embedded_chunks) > 1:
    return embedded_chunks
  elif len(embedded_chunks) == 0:
    return np.nan
  else:
    return embedded_chunks[0]

def createEmbeddings(string, model):
  if "input_ids" in string.keys():
    string = {k: v.to(device=device) for k, v in string.items()}
    with torch.no_grad():
      
        try:
          model_output = model(**string)
        except Exception as ex:
          print("--- Error by creating Embeddings ---")
          print("Error: ", str(ex))
          return np.nan
    # Perform pooling. In this case, average pooling
    try:
      sentence_embeddings = mean_pooling(model_output, string['attention_mask'])
    except Exception as ex:
      print("--- Error by pooling embeddings ---")
      print("Model output: ", str(model_output))
      print("Attention_mask: ", str(string['attention_mask']))
      print("Error: ", str(ex))
      return np.nan
    sentence_embeddings = sentence_embeddings.detach().cpu().numpy()
    return sentence_embeddings
  else:
    return np.nan
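
One way to narrow such an assert down is to check the tokenized length against the model's limit before anything is moved to the GPU. A diagnostic sketch (assuming the tokenizer from above; some_text is a placeholder, and tokenizer.model_max_length should report 512 for this checkpoint):

encoded = tokenizer(some_text, return_tensors='pt', add_special_tokens=False)
n_tokens = encoded["input_ids"].shape[1]
max_len = tokenizer.model_max_length  # fall back to 512 if this reports a sentinel value
if n_tokens > max_len:
    print(f"{n_tokens} tokens exceed the model limit of {max_len}")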


Solution

  • I have found the cause of the error in my case: it is the input size. The model accepts at most 512 tokens, but I was passing 513. The overlong input comes from this line of code:

    encoded_chunk = encoded_string["input_ids"][-(chunk_size*(idx) - decreasing_by_special_tokens*(idx)):]
    

    I had to add a -1 inside the slice:

    encoded_chunk = encoded_string["input_ids"][-(chunk_size*(idx) - decreasing_by_special_tokens*(idx)-1):]
    

    All in all, the cause was an input that exceeded the model's maximum sequence length of 512 tokens.
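
    A more defensive variant (a sketch, not my original code) is to let the tokenizer enforce the limit when a chunk is re-encoded, so an off-by-one in the manual slicing can no longer push the input past 512 tokens:

    encoded_input = tokenizer(
        tokenizer.decode(encoded_chunk, skip_special_tokens=True),
        return_tensors='pt',
        add_special_tokens=False,
        truncation=True,   # hard cap on the sequence length
        max_length=512,    # the model's maximum input size
    )
    embedded_chunk = createEmbeddings(encoded_input, model)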