I want to fine tune the blip model on ROCO database for image captioning chest x-ray images. But I am getting an error regarding integer indexing.
Can anyone please help me understand the cause of the error and how to rectify it.
This is the code:
def read_data(filepath,csv_path,n_samples):
df = pd.read_csv(csv_path)
images = []
capts = []
for idx in range(len(df)):
if 'hest x-ray' in df['caption'][idx] or 'hest X-ray' in df['caption'][idx]:
if len(images)>n_samples:
break
else:
images.append(Image.open(os.path.join(filepath,df['name'][idx])).convert('L'))
capts.append(df['caption'][idx])
return images, capts
def get_data():
imgtrpath = 'all_data/train/radiology/images'
trcsvpath = 'all_data/train/radiology/traindata.csv'
imgtspath = 'all_data/test/radiology/images'
tscsvpath = 'all_data/test/radiology/testdata.csv'
imgvalpath = 'all_data/validation/radiology/images'
valcsvpath = 'all_data/validation/radiology/valdata.csv'
print('Extracting Training Data')
trainimgs, traincapts = read_data(imgtrpath, trcsvpath, 1800)
print('Extracting Testing Data')
testimgs, testcapts = read_data(imgtrpath, trcsvpath, 100)
print('Extracting Validation Data')
valimgs, valcapts = read_data(imgtrpath, trcsvpath, 100)
return trainimgs, traincapts, testimgs, testcapts, valimgs, valcapts
def compute_metrics(eval_pred):
logits, labels = eval_pred
predictions = np.argmax(logits, axis=-1)
return metric.compute(predictions=predictions, references=labels)
trainimgs, traincapts, testimgs, testcapts, valimgs, valcapts = get_data()
model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-large")
processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-large")
metric = evaluate.load("accuracy")
traindata = processor(text=traincapts, images=trainimgs, return_tensors="pt", padding=True, truncation=True)
evaldata = processor(text=testcapts, images=testimgs, return_tensors="pt", padding=True, truncation=True)
training_args = TrainingArguments(output_dir="test_trainer", evaluation_strategy="epoch")
trainer = Trainer(
model=model,
args=training_args,
train_dataset=traindata,
eval_dataset=evaldata,
compute_metrics=compute_metrics
)
trainer.train()
The code is meant to fine-tune the BLIP model on the ROCO dataset chest x-ray images for the purpose of image captioning. But when I run it, I am getting this error:
File "C:\Users\omair\anaconda3\envs\torch\lib\site-packages\transformers\feature_extraction_utils.py", line 86, in __getitem__
raise KeyError("Indexing with integers is not available when using Python based feature extractors")
KeyError: 'Indexing with integers is not available when using Python based feature extractors'
There are two issues here:
...capts
are passed as the model's "Question". There is an example on how to do that in the link below.ValueError: Expected input batch_size (0) to match target batch_size (511).
which can be solved if you put the effort to reproduce the changes made on BlipForQuestionAnswering to BlipForConditionalGeneration.