I am reproducing Andrej Karpathy's nanoGPT, and I am fairly sure my code is identical to his, which means there should not be any syntax error, but I encounter an error like this:
Here is the code:
import torch
import torch.nn as nn
from torch.nn import functional as F
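# hyperparameters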
batch_size =64
block_size =256
max_iters =5000
eval_interval =500
learn_rate=3e-4
device ='cuda' if torch.cuda.is_available() else 'cpu'
eval_iters =200
n_embd=384
n_head =6
n_layer=6
dropout =0.2
print('cuda is ok' if torch.cuda.is_available() else "cuda isn't ok")
#-------
torch.manual_seed(1337)
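# read the training text and build a character-level vocabulary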
with open('input.txt','r',encoding='utf-8') as f:
text =f.read()
chars =sorted(list(set(text)))
vocab_size =len(chars)
stoi={ch:i for i,ch in enumerate(chars)}
itos={i:ch for i,ch in enumerate(chars)}
encode =lambda s:[stoi[c] for c in s]
decode = lambda l: ''.join([itos[i] for i in l])
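# encode the full text and split it into train and validation sets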
data =torch.tensor(encode(text),dtype=torch.long)
n=int(0.9*len(data))
train_data=data[:n]
val_data=data[n:]
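# data loading: sample a random batch of inputs x and targets y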
def get_batch(split):
data=train_data if split=='train' else val_data
ix =torch.randint(len(data)-block_size,(batch_size,))
x=torch.stack([data[i:i+block_size] for i in ix])
y=torch.stack([data[i+1:block_size+i+1] for i in ix])
x,y=x.to(device),y.to(device)
return x,y
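# estimate the loss on both splits, averaged over eval_iters batches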
@torch.no_grad()
def estimate_loss():
out={}
model.eval()
for split in ['train','val']:
losses =torch.zeros(eval_iters)
for k in range(eval_iters):
X,Y=get_batch(split)
logits,loss =model(X,Y)
losses[k]=loss.item()
out[split]=losses.mean()
model.train()
return out
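# one head of self-attention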
class Head(nn.Module):
def __init__(self,head_size):
super().__init__()
self.key= nn.Linear(n_embd,head_size,bias=False)
self.query=nn.Linear(n_embd,head_size,bias=False)
self.value=nn.Linear(n_embd,head_size,bias=False)
self.register_buffer('tril',torch.tril(torch.ones(block_size,block_size)))
self.dropout =nn.Dropout(dropout)
def forward(self,x):
B,T,C=x.shape
k=self.key(x)
q=self.query(x)
wei =q@ k.transpose(-2,-1)*k.shape[-1]**-0.5
wei=wei.masked_fill(self.tril[:T,:T]==0,float('-inf'))
wei=F.softmax(wei,dim=-1)
wei =self.dropout(wei)
v=self.value(x)
out=wei@v
return out
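# multiple heads of self-attention in parallel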
class MutiHeadAttention(nn.Module):
def __init__(self,num_heads,head_size):
super().__init__()
self.heads=nn.ModuleList([Head(head_size) for _ in range(num_heads)])
self.proj =nn.Linear(head_size*num_heads,n_embd)
self.dropout=nn.Dropout(dropout)
def forward(self,x):
out=torch.cat([h(x) for h in self.heads],dim=-1)
out=self.dropout(self.proj(out))
return out
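# a simple linear layer followed by a non-linearity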
class FeedForward(nn.Module):
def __init__(self,n_embd):
super().__init__()
self.net=nn.Sequential(
nn.Linear(n_embd,4*n_embd),
nn.ReLU(),
nn.Linear(4*n_embd,n_embd),
nn.Dropout(dropout),
)
def forward(self,x):
return self.net(x)
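# Transformer block: self-attention followed by feed-forward, each with a residual connection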
class Block(nn.Module):
def __init__(self,n_embd,n_head):
super().__init__()
head_size=n_embd//n_head
self.sa =MutiHeadAttention(n_head,head_size)
self.ffwd=FeedForward(n_embd)
self.ln1=nn.LayerNorm(n_embd)
self.ln2=nn.LayerNorm(n_embd)
def forward(self,x):
x=x+self.sa(self.ln1(x))
x=x+self.ffwd(self.ln2(x))
return x
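# the full GPT language model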
class GPTlanguageModel(nn.Module):
def __init__(self):
super().__init__()
self.token_embedding_table=nn.Embedding(vocab_size,n_embd)
self.position_embedding_table =nn.Embedding(vocab_size,n_embd)
self.blocks=nn.Sequential(*[Block(n_embd,n_head=n_head) for _ in range(n_layer)])
self.ln_f=nn.LayerNorm(n_embd)
self.lm_head=nn.Linear(n_embd,vocab_size)
self.apply(self._init_weights)
def _init_weights(self,module):
if isinstance(module,nn.Linear):
torch.nn.init.normal_(module.weight,mean=0.0,std=0.02)
if module.bias is not None:
torch.nn.init.zeros_(module.bias)
elif isinstance(module,nn.Embedding):
torch.nn.init.normal_(module.weight,mean=0.0,std=0.02)
def forward(self,idx,targets=None):
B,T=idx.shape
tok_emb=self.token_embedding_table(idx)
pos_emb=self.position_embedding_table(torch.arange(T,device=device))
x=tok_emb+pos_emb
x=self.blocks(x)
x=self.ln_f(x)
logits=self.lm_head(x)
if targets is None:
loss =None
else:
B,T,C=logits.shape
logits=logits.view(B*T,C)
targets=targets.view(B*T)
loss=F.cross_entropy(logits,targets)
return logits,loss
def generate(self,idx,max_new_tokens):
for _ in range(max_new_tokens):
idx_cond =idx[:,-block_size:]
logits,loss=self(idx_cond)
logits=logits[:-1:]
probs =F.softmax(logits,dim=-1)
idx_next=torch.multinomial(probs,num_samples=1)
idx=torch.cat((idx,idx_next),dim=1)
return idx
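# build the model, report its parameter count, and set up the optimizer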
model=GPTlanguageModel()
m=model.to(device)
print(sum(p.numel() for p in m.parameters())/1e6,'M parameters')
optimizer= torch.optim.AdamW(model.parameters(),lr=learn_rate)
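# training loop: periodically estimate the loss, checkpoint the model, and take an optimizer step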
for iter in range(max_iters):
if iter % eval_interval ==0 or iter == max_iters-1:
losses =estimate_loss()
min_train_loss=losses['train']
if losses['train']<=min_train_loss:
torch.save(m.state_dict(),'model.pt')
min_train_loss=losses['train']
print(f"step{iter}:train loss {losses['train']:.4f},val loss {losses['val']:.4f}")
xb,yb =get_batch('train')
logits,loss = model(xb,yb)
optimizer.zero_grad(set_to_none=True)
loss.backward()
optimizer.step()
model.load_state_dict(torch.load('model.pt'))
#context =torch.zeros((1,1),dtype=torch.long,device=device)
#print(decode(m.generate(context,max_new_tokens=500)[0].tolist()))
I hope you guys can give me some ideas to solve this problem. Thank you all for your kindness.
The problem is not necessarily in the code itself; your CUDA version can also be the culprit. From the stack trace in your image it looks like you might have an indexing error, but we can't know for sure, because CUDA reports errors asynchronously, so the failure is likely on a different line than the one reported. Add this to the top of your script and trigger the error again:
import os
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"
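Equivalently, you can set the variable from the shell when launching the script (the script name here is just a placeholder):
CUDA_LAUNCH_BLOCKING=1 python gpt.py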
Then you will be able to see exactly where the problem comes from. If no error shows up at all, try setting up a fresh CUDA environment.
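If that still does not pinpoint it, another quick check is to run a single batch on the CPU, where an out-of-range embedding index raises a plain Python IndexError with a usable traceback instead of a deferred CUDA assert. A minimal sketch, assuming the same script and variable names as above and that device is overridden before the model is built:
device = 'cpu'                          # temporary override; must happen before the model and batches are created
model = GPTlanguageModel().to(device)
xb, yb = get_batch('train')             # get_batch and forward both read the global device
logits, loss = model(xb, yb)            # any bad index now fails here with a normal traceback
print(loss.item())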