python scikit-learn yolo train-test-split dataset

YoloV4 Custom Dataset Train Test Split

I try to train a Yolo Net with my custom Dataset. I have some Images (*.jpg) and the labels/annotations in the yolo format as a txt-file.

Now I want to split the data in a train and validation set. As a result I want a train and a validation folder each with their own images and annotations.

I tried something like this:

from sklearn.model_selection import train_test_split
import glob


# Get all paths to your images files and text files
PATH = '../TrainingsData/'
img_paths = glob.glob(PATH+'*.jpg')
txt_paths = glob.glob(PATH+'*.txt')
    
X_train, X_test, y_train, y_test = train_test_split(img_paths, txt_paths, test_size=0.3, random_state=42)

After saving the set to a new folder, the images and annotations got mixed up. So for example in the train folder, some images had no annotation (they were in the validation folder) and there were some annotaions but the image was missing.

Can you help me to split my dataset?

Solution

Ok !!, You can do this

Split images function

def split_img_label(data_train,data_test,folder_train,folder_test):
    
    os.mkdir(folder_train)
    os.mkdir(folder_test)
    
    
    train_ind=list(data_train.index)
    test_ind=list(data_test.index)
    
    
    # Train folder
    for i in tqdm(range(len(train_ind))):
        
        os.system('cp '+data_train[train_ind[i]]+' ./'+ folder_train + '/'  +data_train[train_ind[i]].split('/')[2])
        os.system('cp '+data_train[train_ind[i]].split('.jpg')[0]+'.txt'+'  ./'+ folder_train + '/'  +data_train[train_ind[i]].split('/')[2].split('.jpg')[0]+'.txt')
    
    # Test folder
    for j in tqdm(range(len(test_ind))):
        
        os.system('cp '+data_test[test_ind[j]]+' ./'+ folder_test + '/'  +data_test[test_ind[j]].split('/')[2])
        os.system('cp '+data_test[test_ind[j]].split('.jpg')[0]+'.txt'+'  ./'+ folder_test + '/'  +data_test[test_ind[j]].split('/')[2].split('.jpg')[0]+'.txt')

CODE


import pandas as pd 
import os 

PATH = './TrainingsData/'
list_img=[img for img in os.listdir(PATH) if img.endswith('.jpg')==True]
list_txt=[img for img in os.listdir(PATH) if img.endswith('.txt')==True]

path_img=[]

for i in range (len(list_img)):
    path_img.append(PATH+list_img[i])
    
df=pd.DataFrame(path_img)

# split 
data_train, data_test, labels_train, labels_test = train_test_split(df[0], df.index, test_size=0.20, random_state=42)

# Function split 
split_img_label(data_train,data_test,folder_train_name,folder_test_name)

OUTPUT

len(list_img)
583

100%|████████████████████████████████████████████████████████████████████████████████| 466/466 [00:26<00:00, 17.42it/s]
100%|████████████████████████████████████████████████████████████████████████████████| 117/117 [00:07<00:00, 16.61it/s]

Finally, you will have 2 folders (folder_train_name,folder_test_name) with the same images and labels .