Search code examples
python-3.xdatasetsynthetic

How to create a dataframe of a particular size containing both continuous and categorical values with a uniform random distribution


So, I'm trying to generate some fake random data of a given dimension size. Essentially, I want a dataframe in which the data has a uniform random distribution. The data consist of both continuous and categorical values. I've written the following code, but it doesn't work the way I want it to be.

import random
import pandas as pd
import time
from datetime import datetime

# declare global variables
adv_name = ['soft toys', 'kitchenware', 'electronics',
            'mobile phones', 'laptops']
adv_loc = ['location_1', 'location_2', 'location_3',
       'location_4', 'location_5']
adv_prod = ['baby product', 'kitchenware', 'electronics',
            'mobile phones', 'laptops']
adv_size = [1, 2, 3, 4, 10]
adv_layout = ['static', 'dynamic']  # advertisment layout type on website

# adv_date, start_time, end_time = []
num = 10 # the given dimension

# define function to generate random advert locations
def rand_shuf_loc(str_lst, num):
    lst = adv_loc
    # using list comprehension
    rand_shuf_str = [item for item in lst for i in range(num)]
    return(rand_shuf_str)
    

# define function to generate random advert names
def rand_shuf_prod(loc_list, num):
    rand_shuf_str = [item for item in loc_list for i in range(num)]
    random.shuffle(rand_shuf_str)
    return(rand_shuf_str)

# define function to generate random impression and click data
def rand_clic_impr(num):
    rand_impr_lst = []
    click_lst = []
    for i in range(num):
        rand_impr_lst.append(random.randint(0, 100))
        click_lst.append(random.randint(0, 100))
    return {'rand_impr_lst': rand_impr_lst, 'rand_click_lst': click_lst}

# define function to generate random product price and discount
def rand_prod_price_discount(num):
    prod_price_lst = []  # advertised product price
    prod_discnt_lst = []  # advertised product discount
    
    for i in range(num):
        prod_price_lst.append(random.randint(10, 100))
        prod_discnt_lst.append(random.randint(10, 100))
    
    return {'prod_price_lst': prod_price_lst, 'prod_discnt_lst': prod_discnt_lst}

def rand_prod_click_timestamp(stime, etime, num):
    prod_clik_tmstmp = []
    frmt = '%d-%m-%Y %H:%M:%S'
        
    for i in range(num):
        rtime = int(random.random()*86400)
    
        hours   = int(rtime/3600)
        minutes = int((rtime - hours*3600)/60)
        seconds = rtime - hours*3600 - minutes*60
    
        time_string = '%02d:%02d:%02d' % (hours, minutes, seconds)
        prod_clik_tmstmp.append(time_string)
        time_stmp = [item for item in prod_clik_tmstmp for i in range(num)]
        
    return {'prod_clik_tmstmp_lst':time_stmp}

def main():
    print('generating data...')
    # print('generating random geographic coordinates...')
    # get the impressions and click data
    impression = rand_clic_impr(num)
    clicks = rand_clic_impr(num)
    product_price = rand_prod_price_discount(num)
    product_discount = rand_prod_price_discount(num)
    prod_clik_tmstmp = rand_prod_click_timestamp("20-01-2018 13:30:00",
                                                 "23-01-2018 04:50:34",num)
    lst_dict = {"ad_loc": rand_shuf_loc(adv_loc, num),
                "prod": rand_shuf_prod(adv_prod, num),
                "imprsn": impression['rand_impr_lst'],
                "cliks": clicks['rand_click_lst'],
                "prod_price": product_price['prod_price_lst'],
                "prod_discnt": product_discount['prod_discnt_lst'],
                "prod_clik_stmp": prod_clik_tmstmp['prod_clik_tmstmp_lst']}
    fake_data = pd.DataFrame.from_dict(lst_dict, orient="index")
    res = fake_data.apply(lambda x: x.fillna(0)
                          if x.dtype.kind in 'biufc'
                          # where 'biufc' means boolean, integer,
                          # unicode, float & complex data types
                          else x.fillna(random.randint(0, 100)
                                        )
                          )
    print(res.transpose())
    res.to_csv("fake_data.csv", sep=",")

# invoke the main function
   
if __name__ == "__main__":
    main()

Problem 1

when I execute the above code snippet, it prints fine but when written to csv format, its horizontally positioned; i.e., it looks like this..wrong-data. How do I position it vertically when writing to csv file? What I want is 7 columns (see lst_dict variable above) with n number of rows?

Problem 2 I dont understand why the random date is generated for the first 50 columns and remaining columns are filled with numerical values?


Solution

  • To answer your first question, replace

    print(res.transpose())
    

    with

    res.transpose() print(res)  
    

    To answer your second question look at the length of the output of the method

    rand_shuf_loc() 
    

    it as well as the other helper functions only produce a list of 50 items.
    The creation of res using the method

    fake_data.apply  
    

    replaces all nan with a random numeric, so it also applies a numeric to the columns without any predefined values.