Search code examples
python, pandas, numpy, analysis, sliding-window

Check if this code uses a data sliding window


I want help to check if the following code (from https://github.com/jsh4887/ConvLSTM/blob/main/1.%20Make_dataset.py):

import os
import numpy as np
import pandas as pd
import pickle
from sklearn.preprocessing import MinMaxScaler


def createFolder(directory):
    """Create ``directory`` (and any missing parents) unless it already exists.

    An ``OSError`` raised while checking or creating the directory is not
    propagated; a message is printed instead.
    """
    try:
        if os.path.exists(directory):
            return
        os.makedirs(directory)
    except OSError:
        print('Error: Creating directory. ' + directory)


def Make_Dataset_df(path, start_year, end_year, start_lon, end_lon, lon_bin, start_lat, end_lat, lat_bin):
    """Gather the per-observation value files under ``path`` into one DataFrame.

    For every year from ``start_year`` to ``end_year`` (inclusive) the directory
    ``path + str(year) + '/'`` is listed in sorted order. Each entry that
    contains ``completed_pb_lr/1950.txt`` contributes one row: the date parts
    parsed from fixed character positions of the entry name, followed by the
    numbers read from that file.

    Args:
        path: Root directory. It is joined by plain string concatenation, so it
            must end with a path separator.
        start_year: First year to read (integer).
        end_year: Last year to read (integer, inclusive).
        start_lon, end_lon, lon_bin, start_lat, end_lat, lat_bin: Accepted for
            interface compatibility; this function does not use them.

    Returns:
        A DataFrame with the columns ``Year``, ``Month``, ``Day``, ``Hour``
        followed by integer-labelled value columns. Every row keeps the index
        label 0. An empty DataFrame is returned when no data file is found.

    Side effects:
        Deletes a ``.DS_Store`` file from each year directory when present and
        prints a message for every year without one and for every entry
        without a data file.
    """
    rows = []

    for year in range(start_year, end_year + 1):
        data_path = path + str(year) + '/'

        # Drop the macOS metadata file before listing so it never shows up as an entry.
        ds_store = data_path + '.DS_Store'
        if os.path.exists(ds_store):
            os.remove(ds_store)
        else:
            print("Can not delete the file as it doesn't exists")

        for entry in sorted(os.listdir(data_path)):
            DCGAN_file = data_path + entry + '/completed_pb_lr/1950.txt'

            if not os.path.exists(DCGAN_file):
                print('There is no data')
                continue

            # Whitespace tokenisation reads one number per line as before and
            # also tolerates blank lines or several numbers on one line.
            with open(DCGAN_file, "r") as file:
                DCGAN_pb_value = np.array([float(token) for line in file for token in line.split()])

            # The date is encoded at fixed character positions of the entry name.
            obs_year = int(entry[16:20])
            obs_month = int(entry[21:23])
            obs_day = int(entry[24:26])
            obs_hour = int(entry[27:29])

            # 9999.0 and anything outside [0.5, 100] is treated as missing.
            invalid = (DCGAN_pb_value == 9999.0) | (DCGAN_pb_value < 0.5) | (DCGAN_pb_value > 100)
            DCGAN_pb_value[invalid] = np.nan

            date_data = {'Year': obs_year,
                         'Month': obs_month,
                         'Day': obs_day,
                         'Hour': obs_hour}

            df_1_date = pd.DataFrame(date_data, index=[0])
            df_1_TEC = pd.DataFrame(DCGAN_pb_value).transpose()
            rows.append(pd.concat([df_1_date, df_1_TEC], axis=1))

    if not rows:
        return pd.DataFrame()

    # One concatenation at the end instead of re-copying the growing frame per file.
    return pd.concat(rows, axis=0)


################### Setting ###################

start_year = 2010
end_year = 2010
start_lat = 25.5
end_lat = 41
start_lon = 120
end_lon = 135.5
lat_bin = 0.5
lon_bin = 0.5

path = '/Users/jeongseheon/Desktop/JSH/[2] Data/DCGAN_result/'
saving_path = '/Users/jeongseheon/Desktop/JSH/[1] Project/Forecast_TEC_convLSTM/Dataset/'
title_name = 'DCGAN_PB_TEC_' + str(start_year) + '_' + str(end_year) + ''

# Location of the cached table; built once so the existence check, the read
# and the write cannot drift apart.
initial_dataset_dir = saving_path + 'initial_dataset_2d'
dataset_csv = (initial_dataset_dir + '/' + title_name + '_Dataset_' + str(start_year) + '_'
               + str(end_year) + '.csv')

if os.path.exists(dataset_csv):

    # The file is written below with to_csv's default row-index column, so
    # column 0 is that index, columns 1-4 are the date parts and the rest are values.
    df_values = pd.read_csv(dataset_csv)
    date = df_values.values[:, 1:5]
    values = df_values.values[:, 5:]

else:

    df_data = Make_Dataset_df(path, start_year, end_year, start_lon, end_lon, lon_bin, start_lat, end_lat, lat_bin)
    df_data_intp = df_data.interpolate()
    date = df_data_intp.values[:, 0:4]
    values = df_data_intp.values[:, 4:]

    # Interpolation cannot fill a NaN that sits at the very start of a column,
    # so whatever is still NaN afterwards is set to 0.
    values[np.isnan(values)] = 0

    df_date = pd.DataFrame(date)
    df_values = pd.DataFrame(values)

    save_df = pd.concat([df_date, df_values], axis=1)

    # to_csv does not create missing directories; make sure the target exists.
    createFolder(initial_dataset_dir)
    save_df.to_csv(dataset_csv)


def generate_dataset(data, date, n_samples, past_history, future_target):
    """Scale ``data`` to [0, 1] and cut it into overlapping history/target windows.

    The scaled data is reshaped to ``(n_samples, 32, 32, 1)`` frames. For every
    window start ``i`` in ``range(n_samples - past_history - 2 * future_target)``
    one record is produced:

    * history: frames ``i .. i + past_history - 1``
    * target:  frames ``i + future_target .. i + 2 * future_target - 1``
    * dates:   rows ``i + future_target .. i + future_target + past_history - 1``
      of ``date``, flattened into a single row

    The window start advances by one frame per record, so consecutive records
    overlap.

    NOTE(review): the target and date offsets use ``future_target`` rather than
    ``past_history``; the target only starts right after the history when both
    lengths are equal -- confirm this is intended before using unequal lengths.

    Args:
        data: 2-D array-like with ``n_samples`` rows of 32 * 32 values.
        date: 2-D array indexed as ``date[row, :]``.
        n_samples: Number of frames in ``data``.
        past_history: Number of frames per history window (at least 1).
        future_target: Number of frames per target window (at least 1).

    Returns:
        ``(hist_data_5d, next_data_5d, date_data_5d, scaler)`` where the arrays
        have shapes ``(n, past_history, 32, 32, 1)``,
        ``(n, future_target, 32, 32, 1)`` and ``(n, past_history * date columns)``
        and ``scaler`` is the fitted ``MinMaxScaler``.

    Raises:
        ValueError: If a window length is below 1 or there are too few samples
            to build a single window.
    """
    if past_history < 1 or future_target < 1:
        raise ValueError('past_history and future_target must both be at least 1')

    # Same bound as before: n_samples - past_history - future_target - future_target.
    n_windows = n_samples - past_history - future_target - future_target
    if n_windows < 1:
        raise ValueError('n_samples is too small to build a single window: need more than '
                         + str(past_history + 2 * future_target) + ' samples, got ' + str(n_samples))

    scaler = MinMaxScaler(feature_range=(0, 1))
    scaled_data = scaler.fit_transform(data)
    data_4d = scaled_data.reshape(n_samples, 32, 32, 1)

    # Collect the windows in lists and stack once at the end; concatenating onto
    # the growing result inside the loop re-copies everything on every iteration.
    hist_windows = []
    next_windows = []
    date_windows = []

    for i in range(n_windows):  # one record per window start; the start moves by one frame

        # past_history consecutive frames starting at i -> (past_history, 32, 32, 1)
        hist_frames = np.stack([data_4d[i + t] for t in range(past_history)])

        # date rows offset by future_target, flattened into one 1-D row
        date_row = np.concatenate(
            [date[i + (t + future_target), :] for t in range(past_history)]).reshape(-1)

        # future_target consecutive frames starting at i + future_target
        next_frames = np.stack([data_4d[i + (f + future_target)] for f in range(future_target)])

        hist_windows.append(hist_frames)
        next_windows.append(next_frames)
        date_windows.append(date_row)

        # Progress output: the shape the accumulated result has after this window.
        print('Data shape:' + str((i + 1,) + hist_frames.shape))
        print('Future shape:' + str((i + 1,) + next_frames.shape))

    hist_data_5d = np.stack(hist_windows)
    next_data_5d = np.stack(next_windows)
    date_data_5d = np.stack(date_windows)

    return hist_data_5d, next_data_5d, date_data_5d, scaler


# Window lengths, in frames, for the history and the target part of each record.
past_history = 24
future_target = 24

hist_data_5d, next_data_5d, date_data_5d, scaler = generate_dataset(
    values, date, values.shape[0], past_history, future_target)

# Store the windowed arrays together with the fitted scaler in one pickle file.
model_input_dir = saving_path + 'model_input_dataset_5d'
createFolder(model_input_dir)

pickle_name = (title_name + '_past_' + str(past_history) + '_Dataset_'
               + str(start_year) + '_' + str(end_year) + '.pickle')
with open(model_input_dir + '/' + pickle_name, 'wb') as t1:
    pickle.dump([hist_data_5d, next_data_5d, date_data_5d, scaler], t1)

uses a sliding window whose stride is smaller than 24 frames per iteration. By "stride" I mean how far the window moves between consecutive records of the final arrays returned by hist_data_5d, next_data_5d, date_data_5d, scaler = generate_dataset(values, date, values.shape[0], past_history, future_target). If it does, what is the value of the stride? I think the stride is 24 frames, but I find the code very confusing to read.


Solution

  • Yes, it uses a sliding window in this sense. If you look at the function generate_dataset:

    • It iterates over the window start index i, from 0 up to (but not including) n_samples - past_history - 2 * future_target, so that every start position leaves enough data for the historical and future frames.

    • Within each iteration, it iterates over the past_history frames to collect historical data hist_data and their corresponding dates date_data. These frames are concatenated together to form a sequence of historical frames hist_data_n_frames and dates date_data_n_frames.

    • It also iterates over the future_target frames to collect future data next_data. These frames are concatenated together to form a sequence of future frames next_data_n_frames.

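    The sliding window comes from the outer loop over i, which shifts the start of the window by one frame at each iteration. The nested loops over past_history and future_target collect the consecutive frames inside each window, so successive records are overlapping sequences.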

    Usually if you see a for loop and index slicing, you have a moving window. This is my rule of thumb but others might disagree.

    Hope this helps!

    I will add the part of the code which you asked for.

    for i in range(n_samples - past_history - future_target - future_target):
    
        for t in range(past_history):
    
            # Collect historical data
    
        for f in range(future_target):
    
            # Collect future data
    

    all of these increment by 1

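    Because the loops increment by 1 at each iteration (range(x) defaults to starting at 0 and incrementing by 1 until it reaches x - 1), the stride between each frame in the sequence is 1. The outer loop over i also advances by 1, so the window itself moves by 1 frame from one record to the next: the stride is 1, not 24, and consecutive records share all but one of their frames.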