I am reproducing Andrej Karpathy's nanoGPT, and I am fairly sure my code is identical to his, which means there should not be any syntax error, but I encounter an error like this:
Here is the code:
import torch
import torch.nn as nn
from torch.nn import functional as F
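# hyperparameters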
batch_size =64
block_size =256
max_iters =5000
eval_interval =500
learn_rate=3e-4
device ='cuda' if torch.cuda.is_available() else 'cpu'
eval_iters =200
n_embd=384
n_head =6
n_layer=6
dropout =0.2
print('cuda is ok' if torch.cuda.is_available() else "cuda isn't ok")
#-------
torch.manual_seed(1337)
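# read the training text and build a character-level vocabulary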
with open('input.txt','r',encoding='utf-8') as f:
text =f.read()
chars =sorted(list(set(text)))
vocab_size =len(chars)
stoi={ch:i for i,ch in enumerate(chars)}
itos={i:ch for i,ch in enumerate(chars)}
encode =lambda s:[stoi[c] for c in s]
decode = lambda l: ''.join([itos[i] for i in l])
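# encode the full text and split it into train and validation sets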
data =torch.tensor(encode(text),dtype=torch.long)
n=int(0.9*len(data))
train_data=data[:n]
val_data=data[n:]
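# data loading: sample a random batch of inputs x and targets y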
def get_batch(split):
data=train_data if split=='train' else val_data
ix =torch.randint(len(data)-block_size,(batch_size,))
x=torch.stack([data[i:i+block_size] for i in ix])
y=torch.stack([data[i+1:block_size+i+1] for i in ix])
x,y=x.to(device),y.to(device)
return x,y
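# estimate the loss on both splits, averaged over eval_iters batches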
@torch.no_grad()
def estimate_loss():
out={}
model.eval()
for split in ['train','val']:
losses =torch.zeros(eval_iters)
for k in range(eval_iters):
X,Y=get_batch(split)
logits,loss =model(X,Y)
losses[k]=loss.item()
out[split]=losses.mean()
model.train()
return out
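# one head of self-attention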
class Head(nn.Module):
def __init__(self,head_size):
super().__init__()
self.key= nn.Linear(n_embd,head_size,bias=False)
self.query=nn.Linear(n_embd,head_size,bias=False)
self.value=nn.Linear(n_embd,head_size,bias=False)
self.register_buffer('tril',torch.tril(torch.ones(block_size,block_size)))
self.dropout =nn.Dropout(dropout)
def forward(self,x):
B,T,C=x.shape
k=self.key(x)
q=self.query(x)
wei =q@ k.transpose(-2,-1)*k.shape[-1]**-0.5
wei=wei.masked_fill(self.tril[:T,:T]==0,float('-inf'))
wei=F.softmax(wei,dim=-1)
wei =self.dropout(wei)
v=self.value(x)
out=wei@v
return out
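# multiple heads of self-attention in parallel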
class MutiHeadAttention(nn.Module):
def __init__(self,num_heads,head_size):
super().__init__()
self.heads=nn.ModuleList([Head(head_size) for _ in range(num_heads)])
self.proj =nn.Linear(head_size*num_heads,n_embd)
self.dropout=nn.Dropout(dropout)
def forward(self,x):
out=torch.cat([h(x) for h in self.heads],dim=-1)
out=self.dropout(self.proj(out))
return out
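# a simple linear layer followed by a non-linearity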
class FeedForward(nn.Module):
def __init__(self,n_embd):
super().__init__()
self.net=nn.Sequential(
nn.Linear(n_embd,4*n_embd),
nn.ReLU(),
nn.Linear(4*n_embd,n_embd),
nn.Dropout(dropout),
)
def forward(self,x):
return self.net(x)
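# Transformer block: self-attention followed by feed-forward, each with a residual connection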
class Block(nn.Module):
def __init__(self,n_embd,n_head):
super().__init__()
head_size=n_embd//n_head
self.sa =MutiHeadAttention(n_head,head_size)
self.ffwd=FeedForward(n_embd)
self.ln1=nn.LayerNorm(n_embd)
self.ln2=nn.LayerNorm(n_embd)
def forward(self,x):
x=x+self.sa(self.ln1(x))
x=x+self.ffwd(self.ln2(x))
return x
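# the full GPT language model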
class GPTlanguageModel(nn.Module):
def __init__(self):
super().__init__()
self.token_embedding_table=nn.Embedding(vocab_size,n_embd)
self.position_embedding_table =nn.Embedding(vocab_size,n_embd)
self.blocks=nn.Sequential(*[Block(n_embd,n_head=n_head) for _ in range(n_layer)])
self.ln_f=nn.LayerNorm(n_embd)
self.lm_head=nn.Linear(n_embd,vocab_size)
self.apply(self._init_weights)
def _init_weights(self,module):
if isinstance(module,nn.Linear):
torch.nn.init.normal_(module.weight,mean=0.0,std=0.02)
if module.bias is not None:
torch.nn.init.zeros_(module.bias)
elif isinstance(module,nn.Embedding):
torch.nn.init.normal_(module.weight,mean=0.0,std=0.02)
def forward(self,idx,targets=None):
B,T=idx.shape
tok_emb=self.token_embedding_table(idx)
pos_emb=self.position_embedding_table(torch.arange(T,device=device))
x=tok_emb+pos_emb
x=self.blocks(x)
x=self.ln_f(x)
logits=self.lm_head(x)
if targets is None:
loss =None
else:
B,T,C=logits.shape
logits=logits.view(B*T,C)
targets=targets.view(B*T)
loss=F.cross_entropy(logits,targets)
return logits,loss
def generate(self,idx,max_new_tokens):
for _ in range(max_new_tokens):
idx_cond =idx[:,-block_size:]
logits,loss=self(idx_cond)
logits=logits[:-1:]
probs =F.softmax(logits,dim=-1)
idx_next=torch.multinomial(probs,num_samples=1)
idx=torch.cat((idx,idx_next),dim=1)
return idx
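# build the model, report its parameter count, and set up the optimizer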
model=GPTlanguageModel()
m=model.to(device)
print(sum(p.numel() for p in m.parameters())/1e6,'M parameters')
optimizer= torch.optim.AdamW(model.parameters(),lr=learn_rate)
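# training loop: periodically estimate the loss, checkpoint the model, and take an optimizer step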
for iter in range(max_iters):
if iter % eval_interval ==0 or iter == max_iters-1:
losses =estimate_loss()
min_train_loss=losses['train']
if losses['train']<=min_train_loss:
torch.save(m.state_dict(),'model.pt')
min_train_loss=losses['train']
print(f"step{iter}:train loss {losses['train']:.4f},val loss {losses['val']:.4f}")
xb,yb =get_batch('train')
logits,loss = model(xb,yb)
optimizer.zero_grad(set_to_none=True)
loss.backward()
optimizer.step()
model.load_state_dict(torch.load('model.pt'))
#context =torch.zeros((1,1),dtype=torch.long,device=device)
#print(decode(m.generate(context,max_new_tokens=500)[0].tolist()))
I hope you guys can give me some ideas to solve this problem. Thank you all for your kindness.
The problem is not necessarily in the code itself; your CUDA version can also be the culprit. From the stack trace in your image it looks like you might have an indexing error, but we can't know for sure, because CUDA reports errors asynchronously, so the failure is likely on a different line than the one reported. Add this to the top of your script and trigger the error again:
import os
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"
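Equivalently, you can set the variable from the shell when launching the script (the script name here is just a placeholder):
CUDA_LAUNCH_BLOCKING=1 python gpt.py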
Then you will be able to see exactly where the problem comes from. If no error shows up at all, try setting up a fresh CUDA environment.
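If that still does not pinpoint it, another quick check is to run a single batch on the CPU, where an out-of-range embedding index raises a plain Python IndexError with a usable traceback instead of a deferred CUDA assert. A minimal sketch, assuming the same script and variable names as above and that device is overridden before the model is built:
device = 'cpu'                          # temporary override; must happen before the model and batches are created
model = GPTlanguageModel().to(device)
xb, yb = get_batch('train')             # get_batch and forward both read the global device
logits, loss = model(xb, yb)            # any bad index now fails here with a normal traceback
print(loss.item())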