Search code examples
pythonpandasnumpyplotly

Creating an ndarray from ragged nested sequences is deprecated ... you must specify 'dtype=object' when creating the ndarray


I'm trying to read an Excel spreadsheet and create a graph based on the data from certain columns. I'm using Python 3.6.2 since some libraries used in my script, do not work well in the newer versions.

This is my code:

import os
import plotly
import plotly.graph_objects as go
import pandas as pd
import plotly.express as px

def create_graph(directoryPath):
    for root, subFolders, files in os.walk(directoryPath):
        for f in files:
            if "screenlog" in f.lower():
                print("Creating graphs from the Screen logs")
                absolutePathOfFile = os.path.join(root, f)
                pd.options.mode.chained_assignment = None  # default="warn"
                #PROCESSING THE EXCEL FILE
                df_init = pd.read_excel(absolutePathOfFile,engine="openpyxl")
                df_init["Screen&Action"]= df_init["SCREEN"].str.cat(df_init["Action_Name"], sep=" - ")
                ##### PLOTING Scatter chart######
                print("Plotting the scatter graphs")
                #using plotly to create a scatter graph
                fig_scatter= px.scatter(df_init, x="INSTANT", y="DURATION",color="Screen&Action",height=500,title="Traditional Screen Analysis")
                fig_scatter.update_layout(showlegend=False,title={       
                        "y":0.9,
                        "x":0.5,
                        "xanchor": "center",
                        "yanchor": "top"})
        fig_scatter.show()

The error message:

> Creating graphs from the Screen logs
> ...\AppData\Local\Programs\Python\Python36\lib\site-packages\numpy\core\fromnumeric.py:87:
> VisibleDeprecationWarning:
> 
> Creating an ndarray from ragged nested sequences (which is a
> list-or-tuple of lists-or-tuples-or ndarrays with different lengths or
> shapes) is deprecated. If you meant to do this, you must specify
> 'dtype=object' when creating the ndarray
> 
> Plotting the scatter graphs Traceback (most recent call last):   File
> "...\Documents\log_app\log_parser.py", line 3812, in <module>
>     scripts.screen.create_graph(directoryPath)   File "...\Documents\log_app\scripts\screen.py", line 20, in create_graph
>     fig_scatter= px.scatter(df_init, x="INSTANT", y="DURATION",color="Screen&Action",height=500,title="Traditional
> Screen Analysis")   File
> "...\AppData\Local\Programs\Python\Python36\lib\site-packages\plotly\express\_chart_types.py",
> line 66, in scatter
>     return make_figure(args=locals(), constructor=go.Scatter)   File "...\AppData\Local\Programs\Python\Python36\lib\site-packages\plotly\express\_core.py",
> line 1988, in make_figure
>     group = grouped.get_group(group_name if len(group_name) > 1 else group_name[0])   File
> "...\AppData\Local\Programs\Python\Python36\lib\site-packages\pandas\core\groupby\groupby.py",
> line 680, in get_group
>     raise KeyError(name) KeyError: (nan, '', '', '', '')

I spent some time searching for a workaround, and I found several sites that mentioned adding "dtype=object" as an extra parameter should fix the problem.

However, how can I add "dtype=object" in this line?:

fig_scatter= px.scatter(df_init, x="INSTANT", y="DURATION",color="Screen&Action",height=500,title="Traditional Screen Analysis")

I always get a syntax error.

Dataframe:

print(df_init)

       Tenant_Id  ...                                    Screen&Action
0              1  ...      Screen_Logs - TelemetryClickEvent.SendEvent
1              1  ...                      Screen_Logs - FilterOrReset
2              1  ...  Cyclic_Job_Logs - TelemetryClickEvent.SendEvent
3              1  ...                        Screen_Logs - Preparation
4             20  ...                 EventoDetalhe_New - Load_Atletas
...          ...  ...                                              ...
31893         20  ...                              Login - Preparation
31894          1  ...                              Login - Preparation
31895         20  ...                              Login - Preparation
31896          1  ...                              Login - Preparation
31897         20  ...                              Login - Preparation

[31898 rows x 20 columns]

print(df_init.head(10).to_dict("records"))

[{'Tenant_Id': 1, 'INSTANT': Timestamp('2021-11-25 21:44:15.345000'), 
'DURATION': 14, 'SCREEN': 'Screen_Logs', 'Session_Id': 'N30TTfz+v0G6OuDKYznMeA==', 
'User_Id': 1263, 'Espace_Id': 1, 'MSISDN': nan, 'Screen_Type': 'WEB', 
'Executed_By': 'E3Q3J-PR4U18', 'Session_Bytes': 8207, 'Viewstate_Bytes': 4204, 
'Session_Requests': 1, 'Access_Mode': 'Ajax', 'Request_Key': 'ac581f0c-ae8a-4f1d-8eea-39c0f89171b7', 
'Action_Name': 'TelemetryClickEvent.SendEvent', 'Espace_Name': 'ServiceCenter', 
'Application_Name': 'Service Center', 'Application_Key': '463836d2-9aea-42ff-9f58-a1e78e163c11', 
'Screen&Action': 'Screen_Logs - TelemetryClickEvent.SendEvent'}, 
{'Tenant_Id': 1, 'INSTANT': Timestamp('2021-11-25 21:44:15.202000'), 
'DURATION': 31, 'SCREEN': 'Screen_Logs', 'Session_Id': 'N30TTfz+v0G6OuDKYznMeA==', 
'User_Id': 1263, 'Espace_Id': 1, 'MSISDN': nan, 'Screen_Type': 'WEB', 'Executed_By': 'E3Q3J-PR4U18', 
'Session_Bytes': 8207, 'Viewstate_Bytes': 4184, 'Session_Requests': 1, 'Access_Mode': 'Ajax', 
'Request_Key': 'd4f00718-3fbe-416d-8540-aeed02f371b3', 'Action_Name': 'FilterOrReset', 
'Espace_Name': 'ServiceCenter', 'Application_Name': 'Service Center', 
'Application_Key': '463836d2-9aea-42ff-9f58-a1e78e163c11', 'Screen&Action': 
'Screen_Logs - FilterOrReset'}, {'Tenant_Id': 1, 'INSTANT': Timestamp('2021-11-25 21:44:11.733000'), 
'DURATION': 15, 'SCREEN': 'Cyclic_Job_Logs', 'Session_Id': 'N30TTfz+v0G6OuDKYznMeA==', 
'User_Id': 1263, 'Espace_Id': 1, 'MSISDN': nan, 'Screen_Type': 'WEB', 
'Executed_By': 'E3Q3J-PR4U18', 'Session_Bytes': 8093, 'Viewstate_Bytes': 4160, 
'Session_Requests': 1, 'Access_Mode': 'Ajax', 'Request_Key': 'a35aa5c7-81e9-487a-8df3-474b12df2a1a', 
'Action_Name': 'TelemetryClickEvent.SendEvent', 'Espace_Name': 'ServiceCenter', 
'Application_Name': 'Service Center', 'Application_Key': '463836d2-9aea-42ff-9f58-a1e78e163c11', 
'Screen&Action': 'Cyclic_Job_Logs - TelemetryClickEvent.SendEvent'}, 
{'Tenant_Id': 1, 'INSTANT': Timestamp('2021-11-25 21:44:11.733000'), 'DURATION': 797, 
'SCREEN': 'Screen_Logs', 'Session_Id': 'N30TTfz+v0G6OuDKYznMeA==', 'User_Id': 1263, 
'Espace_Id': 1, 'MSISDN': nan, 'Screen_Type': 'WEB', 'Executed_By': 'E3Q3J-PR4U18', 
'Session_Bytes': 8093, 'Viewstate_Bytes': 0, 'Session_Requests': 1, 
'Access_Mode': 'Screen', 'Request_Key': '4ac69574-86e2-4b85-86eb-e8cda703193c', 
'Action_Name': 'Preparation', 'Espace_Name': 'ServiceCenter', 'Application_Name': 'Service Center', 
'Application_Key': '463836d2-9aea-42ff-9f58-a1e78e163c11', 'Screen&Action': 'Screen_Logs - Preparation'}, 
{'Tenant_Id': 20, 'INSTANT': Timestamp('2021-11-25 21:44:05.108000'), 'DURATION': 1359, 
'SCREEN': 'EventoDetalhe_New', 'Session_Id': 'jXC8xqSk1Eu0q4C+VQNZww==', 'User_Id': 151, 
'Espace_Id': 80, 'MSISDN': nan, 'Screen_Type': 'WEB', 'Executed_By': 'E3Q3J-PR4U18', 
'Session_Bytes': 17444, 'Viewstate_Bytes': 45784, 'Session_Requests': 4, 'Access_Mode': 'Ajax', 
'Request_Key': '9f62097f-fe6a-468a-a4d4-5b2c39e295ce', 'Action_Name': 'Load_Atletas', 
'Espace_Name': 'TalentWeb', 'Application_Name': 'TalentWeb', 
'Application_Key': '448b995f-e07e-4a21-a442-12204cfffea6', 
'Screen&Action': 'EventoDetalhe_New - Load_Atletas'}, 
{'Tenant_Id': 20, 'INSTANT': Timestamp('2021-11-25 21:44:04.573000'), 'DURATION': 33, 
'SCREEN': 'EventoDetalhe_New', 'Session_Id': 'jXC8xqSk1Eu0q4C+VQNZww==', 'User_Id': 151, 
'Espace_Id': 80, 'MSISDN': nan, 'Screen_Type': 'WEB', 'Executed_By': 'E3Q3J-PR4U1S', 
'Session_Bytes': 12431, 'Viewstate_Bytes': 45824, 'Session_Requests': 1, 'Access_Mode': 'Ajax', 
'Request_Key': '61d1d2a1-2c56-4e1d-a752-8d1719aa77a8', 'Action_Name': 'Tabs.OnChange', 
'Espace_Name': 'TalentWeb', 'Application_Name': 'TalentWeb', 
'Application_Key': '448b995f-e07e-4a21-a442-12204cfffea6', 'Screen&Action': 'EventoDetalhe_New - Tabs.OnChange'},
 {'Tenant_Id': 1, 'INSTANT': Timestamp('2021-11-25 21:44:02.658000'), 'DURATION': 31, 
'SCREEN': 'Login', 'Session_Id': 'am9Hoz4suEifYkaRsh8usQ==', 'User_Id': 0, 'Espace_Id': 1, 
'MSISDN': nan, 'Screen_Type': 'WEB', 'Executed_By': 'E3Q3J-PR4U2H', 
'Session_Bytes': 0, 'Viewstate_Bytes': 0, 'Session_Requests': 1, 'Access_Mode': 'Screen', 
'Request_Key': '1b4f4776-f622-46da-ac0f-88a897c5b8a7', 'Action_Name': 'Preparation', 
'Espace_Name': 'ServiceCenter', 'Application_Name': 'Service Center', 
'Application_Key': '463836d2-9aea-42ff-9f58-a1e78e163c11', 'Screen&Action': 'Login - Preparation'}, 
{'Tenant_Id': 20, 'INSTANT': Timestamp('2021-11-25 21:43:59.729000'), 'DURATION': 314, 
'SCREEN': 'EventoDetalhe_New', 'Session_Id': 'jXC8xqSk1Eu0q4C+VQNZww==', 'User_Id': 151, 
'Espace_Id': 80, 'MSISDN': nan, 'Screen_Type': 'WEB', 'Executed_By': 'E3Q3J-PR4U1S', 
'Session_Bytes': 14891, 'Viewstate_Bytes': 0, 'Session_Requests': 3, 
'Access_Mode': 'Screen', 'Request_Key': '866f27bb-f86a-4add-b084-f433e1cadc73', 
'Action_Name': 'Preparation', 'Espace_Name': 'TalentWeb', 'Application_Name': 'TalentWeb', 
'Application_Key': '448b995f-e07e-4a21-a442-12204cfffea6', 'Screen&Action': 'EventoDetalhe_New - Preparation'}, 
{'Tenant_Id': 1, 'INSTANT': Timestamp('2021-11-25 21:43:53.594000'), 'DURATION': 1904, 
'SCREEN': 'Cyclic_Job_Logs', 'Session_Id': 'N30TTfz+v0G6OuDKYznMeA==', 'User_Id': 1263, 
'Espace_Id': 1, 'MSISDN': nan, 'Screen_Type': 'WEB', 'Executed_By': 'E3Q3J-PR4U18', 
'Session_Bytes': 8093, 'Viewstate_Bytes': 4160, 'Session_Requests': 1, 
'Access_Mode': 'Screen', 'Request_Key': '26f2d266-b2fc-4a2f-8551-046ba9604477', 
'Action_Name': 'ExportToExcel', 'Espace_Name': 'ServiceCenter', 'Application_Name': 'Service Center', 
'Application_Key': '463836d2-9aea-42ff-9f58-a1e78e163c11', 
'Screen&Action': 'Cyclic_Job_Logs - ExportToExcel'}, 
{'Tenant_Id': 1, 'INSTANT': Timestamp('2021-11-25 21:43:53.594000'), 'DURATION': 2060, 
'SCREEN': 'Cyclic_Job_Logs', 'Session_Id': 'N30TTfz+v0G6OuDKYznMeA==', 'User_Id': 1263, 
'Espace_Id': 1, 'MSISDN': nan, 'Screen_Type': 'WEB', 'Executed_By': 'E3Q3J-PR4U18', 
'Session_Bytes': 8093, 'Viewstate_Bytes': 4160, 'Session_Requests': 1, 
'Access_Mode': 'Ajax', 'Request_Key': '66d521dd-905d-4f66-ac99-a982bcccb99b', 
'Action_Name': 'TelemetryClickEvent.SendEvent', 'Espace_Name': 'ServiceCenter', 
'Application_Name': 'Service Center', 'Application_Key': '463836d2-9aea-42ff-9f58-a1e78e163c11', 
'Screen&Action': 'Cyclic_Job_Logs - TelemetryClickEvent.SendEvent'}]

print(df_init[df_init['Screen&Action'].isnull()])

Tenant_Id                 INSTANT  DURATION  ... Application_Name                       Application_Key  Screen&Action
25708         20 2021-11-24 13:45:47.022        93  ...   Statvue Emails  f256c73c-722e-4adb-b2c1-c80cc20c2745            NaN
25763         20 2021-11-24 13:39:20.798         6  ...   Statvue Emails  f256c73c-722e-4adb-b2c1-c80cc20c2745            NaN
25782         20 2021-11-24 13:38:06.664       171  ...   Statvue Emails  f256c73c-722e-4adb-b2c1-c80cc20c2745            NaN
25805         20 2021-11-24 13:36:15.512      2295  ...   Statvue Emails  f256c73c-722e-4adb-b2c1-c80cc20c2745            NaN

The Excel file I'm trying to read.

Any help is appreciated.


Solution

    • using sample 10 rows you provided. Replicated the error by setting SCREEN to nan for two rows
    • this is a data quality issue (I'm using python 3.9 and plotly 5.5.0)
    • two columns have this potential issue as you concatenate them together
    • have worked around by replacing nan with fillna() in target concatenated column
    # df_init = pd.read_excel(absolutePathOfFile, engine="openpyxl")
    
    # simulate a couple of missing values
    df_init.loc[df_init.sample(2).index,"SCREEN"] = np.nan
    df_init.loc[df_init.sample(2).index,"Action_Name"] = ""
    
    df_init["Screen&Action"] = df_init["SCREEN"].str.cat(df_init["Action_Name"], sep=" - ")
    ##### PLOTING Scatter chart######
    print("Plotting the scatter graphs")
    # using plotly to create a scatter graph
    try:
        fig_scatter = px.scatter(
            df_init,
            x="INSTANT",
            y="DURATION",
            color="Screen&Action",
            height=500,
            title="Traditional Screen Analysis",
        )
    except KeyError:
        print("failed, defaulting")
        df_init["Screen&Action"] = df_init["Screen&Action"].fillna("unknown")
        fig_scatter = px.scatter(
            df_init,
            x="INSTANT",
            y="DURATION",
            color="Screen&Action",
            height=500,
            title="Traditional Screen Analysis",
        )
    
        
    fig_scatter.update_layout(
        showlegend=False, title={"y": 0.9, "x": 0.5, "xanchor": "center", "yanchor": "top"}
    )
    fig_scatter.show()