I have a pandas dataframe containing json
data that I am attempting to normalize using pandas json_normalize
.
import pandas as pd
df = pd.DataFrame({
'id': [0, 1],
'j': [{'type': 's', 'loc': {'s': {'value': '0.0', 'text': 'xxx'}, 'ps': {'value': '0.0', 'text': 'xxx'}, 'g': {'value': '2.0', 'text': 'xxx'}, 'g':[]}}, {'type': 's', 'loc': {'s': {'value': '0.0', 'text': 'xxx'}, 'ps': {'value': '0.0', 'text': 'xxx'}, 'g': {'value': '2.0', 'text': 'xxx'}, 'g':[]}}]
})
dff = pd.json_normalize(df['j'], record_path=['loc'], meta=['id'])
dff
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
File <command-3958878855945685>:6
1 df = pd.DataFrame({
2 'id': [0, 1],
3 'j': [{'type': 's', 'loc': {'s': {'value': '0.0', 'text': 'xxx'}, 'ps': {'value': '0.0', 'text': 'xxx'}, 'g': {'value': '2.0', 'text': 'xxx'}, 'g':[]}}, {'type': 's', 'loc': {'s': {'value': '0.0', 'text': 'xxx'}, 'ps': {'value': '0.0', 'text': 'xxx'}, 'g': {'value': '2.0', 'text': 'xxx'}, 'g':[]}}]
4 })
----> 6 dff = pd.json_normalize(df['j'], record_path=['loc'], meta=['id'])
7 dff
File /databricks/python/lib/python3.9/site-packages/pandas/io/json/_normalize.py:515, in _json_normalize(data, record_path, meta, meta_prefix, record_prefix, errors, sep, max_level)
512 meta_vals[key].append(meta_val)
513 records.extend(recs)
--> 515 _recursive_extract(data, record_path, {}, level=0)
517 result = DataFrame(records)
519 if record_prefix is not None:
520 # Incompatible types in assignment (expression has type "Optional[DataFrame]",
521 # variable has type "DataFrame")
File /databricks/python/lib/python3.9/site-packages/pandas/io/json/_normalize.py:497, in _json_normalize.<locals>._recursive_extract(data, path, seen_meta, level)
495 else:
496 for obj in data:
--> 497 recs = _pull_records(obj, path[0])
498 recs = [
499 nested_to_record(r, sep=sep, max_level=max_level)
500 if isinstance(r, dict)
501 else r
502 for r in recs
503 ]
505 # For repeating the metadata later
File /databricks/python/lib/python3.9/site-packages/pandas/io/json/_normalize.py:427, in _json_normalize.<locals>._pull_records(js, spec)
425 result = []
426 else:
--> 427 raise TypeError(
428 f"{js} has non list value {result} for path {spec}. "
429 "Must be list or null."
430 )
431 return result
TypeError: {'type': 's', 'loc': {'s': {'value': '0.0', 'text': 'xxx'}, 'ps': {'value': '0.0', 'text': 'xxx'}, 'g': []}} has non list value {'s': {'value': '0.0', 'text': 'xxx'}, 'ps': {'value': '0.0', 'text': 'xxx'}, 'g': []} for path loc. Must be list or null.
Also, there might be some json data where the string would be an empty string. How would I handle that exception? Desired behavior is if the key
is not found, there normalize with np.nan
.
Expected output:
s.value s.text ps.value ps.text g.value g.text g.id id
0 0.0 xxx 0.0 xxx 2.0 xxx [0.0] 0
1 0.0 xxx 0.0 xxx 2.0 xxx [0.0] 1
You json is invalid, you should have a list in loc
and you als have a duplicated g
key. Also you can't use id
as meta as this key doesn't exist in the json.
With a correct format, this should give:
df = pd.DataFrame({
'id': [0, 1],
'j': [{'type': 's', 'loc': [{'s': {'value': '0.0', 'text': 'xxx'}, 'ps': {'value': '0.0', 'text': 'xxx'}, 'g': {'value': '2.0', 'text': 'xxx'}}]},
{'type': 's', 'loc': [{'s': {'value': '0.0', 'text': 'xxx'}, 'ps': {'value': '0.0', 'text': 'xxx'}, 'g': {'value': '2.0', 'text': 'xxx'}}]}
]
})
dff = pd.json_normalize(df['j'], record_path=['loc'])
Output:
s.value s.text ps.value ps.text g.value g.text
0 0.0 xxx 0.0 xxx 2.0 xxx
1 0.0 xxx 0.0 xxx 2.0 xxx
With your current format, assuming no duplication of g
you can use:
df = pd.DataFrame({
'id': [0, 1],
'j': [{'type': 's', 'loc': {'s': {'value': '0.0', 'text': 'xxx'}, 'ps': {'value': '0.0', 'text': 'xxx'}, 'g': {'value': '2.0', 'text': 'xxx'}}},
{'type': 's', 'loc': {'s': {'value': '0.0', 'text': 'xxx'}, 'ps': {'value': '0.0', 'text': 'xxx'}, 'g': {'value': '2.0', 'text': 'xxx'}}}]
})
pd.json_normalize(df['j'].str['loc'])
Output:
s.value s.text ps.value ps.text g.value g.text
0 0.0 xxx 0.0 xxx 2.0 xxx
1 0.0 xxx 0.0 xxx 2.0 xxx