Search code examples
pythonpandasmachine-learningscikit-learnsklearn-pandas

Assign custom categories to json data - pandas


Assigning labels to raw data instead of getting new indicator columns from get_dummies. I want something like this :

json_input:

[{id:100,vehicle_type:"Car", time:"2017-04-06 01:39:43", zone="A", type:"Checked"}, {id:101,vehicle_type:"Truck", time:"2017-04-06 02:35:45", zone="B", type:"Unchecked"}, {id:102,vehicle_type:"Truck", time:"2017-04-05 03:20:12", zone="A", type:"Checked"}, {id:103,vehicle_type:"Car", time:"2017-04-04 10:05:04", zone="C", type:"Unchecked"} ]

Result:

  • id , vehicle_type, time_range, zone, type
  • 100, 0 , 1 , 1 , 1
  • 101, 1 , 1 , 2 , 0
  • 102, 1 , 2 , 1 , 1
  • 103, 0 , 3 , 3 , 0

time stamp- TS columns -> vehicle_type, type are binary , time_range (1 -> (TS1-TS2),2 -> (TS3-TS4), 3->(TS5-TS6)), zone-> categorical(1,2 or 3). I want to auto assign these labels when I feed the flattened json to dataframe in pandas. Is this possible? ( I do not want zone_1, type_1, vehicle_type_3 indicator columns from get_dummies in pandas). If not possible with pandas, please suggest python lib for this automation.


Solution

  • Here is what I could come up with. I do not know what time ranges you are looking for

    import datetime
    import io
    import pandas as pd
    import numpy as np
    df_string='[{"id":100,"vehicle_type":"Car","time":"2017-04-06 01:39:43","zone":"A","type":"Checked"},{"id":101,"vehicle_type":"Truck","time":"2017-04-06 02:35:45","zone":"B","type":"Unchecked"},{"id":102,"vehicle_type":"Truck","time":"2017-04-05 03:20:12","zone":"A","type":"Checked"},{"id":103,"vehicle_type":"Car","time":"2017-04-04 10:05:04","zone":"C","type":"Unchecked"}]'
    df = pd.read_json(io.StringIO(df_string))
    df['zone'] = pd.Categorical(df.zone)
    df['vehicle_type'] = pd.Categorical(df.vehicle_type)
    df['type'] = pd.Categorical(df.type)
    df['zone_int'] = df.zone.cat.codes
    df['vehicle_type_int'] = df.vehicle_type.cat.codes
    df['type_int'] = df.type.cat.codes
    df.head()
    

    Edit Here is what I could come up with

    import datetime
    import io
    import math
    import pandas as pd
    #Taken from http://stackoverflow.com/questions/13071384/python-ceil-a-datetime-to-next-quarter-of-an-hour
    def ceil_dt(dt, num_seconds=900):
        nsecs = dt.minute*60 + dt.second + dt.microsecond*1e-6  
        delta = math.ceil(nsecs / num_seconds) * num_seconds - nsecs
        return dt + datetime.timedelta(seconds=delta)
    
    df_string='[{"id":100,"vehicle_type":"Car","time":"2017-04-06 01:39:43","zone":"A","type":"Checked"},{"id":101,"vehicle_type":"Truck","time":"2017-04-06 02:35:45","zone":"B","type":"Unchecked"},{"id":102,"vehicle_type":"Truck","time":"2017-04-05 03:20:12","zone":"A","type":"Checked"},{"id":103,"vehicle_type":"Car","time":"2017-04-04 10:05:04","zone":"C","type":"Unchecked"}]'
    df = pd.read_json(io.StringIO(df_string))
    df['zone'] = pd.Categorical(df.zone)
    df['vehicle_type'] = pd.Categorical(df.vehicle_type)
    df['type'] = pd.Categorical(df.type)
    df['zone_int'] = df.zone.cat.codes
    df['vehicle_type_int'] = df.vehicle_type.cat.codes
    df['type_int'] = df.type.cat.codes
    df['time'] = pd.to_datetime(df.time)
    df['dayofweek'] = df.time.dt.dayofweek
    df['month_int'] = df.time.dt.month
    df['year_int'] = df.time.dt.year
    df['day'] = df.time.dt.day
    df['date'] = df.time.apply(lambda x: x.date())
    df['month'] = df.date.apply(lambda x: datetime.date(x.year, x.month, 1))
    df['year'] = df.date.apply(lambda x: datetime.date(x.year, 1, 1))
    df['hour'] = df.time.dt.hour
    df['mins']  = df.time.dt.minute
    df['seconds'] = df.time.dt.second
    df['time_interval_3hour'] = df.hour.apply(lambda x : math.floor(x/3)+1)
    df['time_interval_6hour'] = df.hour.apply(lambda x : math.floor(x/6)+1)
    df['time_interval_12hour'] = df.hour.apply(lambda x : math.floor(x/12)+1)
    df['weekend']  = df.dayofweek.apply(lambda x:  x>4)
    
    df['ceil_quarter_an_hour'] =df.time.apply(lambda x : ceil_dt(x))
    df['ceil_half_an_hour'] =df.time.apply(lambda x : ceil_dt(x, num_seconds=1800))
    df.head()