I have a dataframe of users coming from 2 postgresql tables that I need to insert into another database. In this one, there is a fixed number of columns. All additionnal information is to be saved in a "champs_addi" column. The champs_addi column is to be in jsonb format in ²postgresql.
For example I need to add the not null values of id_1 and id_2 skipping NA values to champs_addi in this :
import numpy as np
import pandas as pd
df = pd.DataFrame({
'nom': ['sireesha', 'ravi', 'rohith', 'pinkey', 'gnanesh'],
'prenom': ['mary', 'jack', 'mark', 'someone', 'anothersomeone'],
'id_1': [98, np.nan, 32, np.nan, 23],
'id_2': [1, 12, 78, np.nan, 87],
'champs_addi': [np.nan, np.nan, '{"statut": "actif"}', np.nan, np.nan],
})
#my id needs to be integers -> conversion
for col in df.columns[df.isna().any()].tolist():
if df[col].dtype == 'float':
df[col] = df[col].astype('Int64')
if all cells are null, I can use something like that :
df['champs_addi']='{"id_1": "'+df["id_1"].astype(str)+'"}'
I tried using iloc, for loop and other stuff but get lost in the multiple conditions. Any idea how to do that?
Try:
import json
def add_to_champs_addi(g):
id_1 = np.nan if pd.isna(g["id_1"]) else json.dumps({"id_1": str(int(g["id_1"]))})
id_2 = np.nan if pd.isna(g["id_2"]) else json.dumps({"id_2": str(int(g["id_2"]))})
ca = g["champs_addi"]
s = pd.Series([ca, id_1, id_2])
if s.isna().all():
return np.nan
return " ".join(s[s.notna()])
df["champs_addi"] = df.apply(add_to_champs_addi, axis=1)
print(df)
Prints:
nom prenom id_1 id_2 champs_addi
0 sireesha mary 98.0 1.0 {"id_1": "98"} {"id_2": "1"}
1 ravi jack NaN 12.0 {"id_2": "12"}
2 rohith mark 32.0 78.0 {"statut": "actif"} {"id_1": "32"} {"id_2": "78"}
3 pinkey someone NaN NaN NaN
4 gnanesh anothersomeone 23.0 87.0 {"id_1": "23"} {"id_2": "87"}
EDIT: With columns as a parameter:
import json
def add_to_champs_addi(g, columns):
cols = [
np.nan if pd.isna(g[c]) else json.dumps({c: str(int(g[c]))}) for c in columns
]
s = pd.Series([g["champs_addi"], *cols])
if s.isna().all():
return np.nan
return " ".join(s[s.notna()])
columns = ["id_1", "id_2"]
df["champs_addi"] = df.apply(add_to_champs_addi, columns=columns, axis=1)
print(df)