Search code examples
python json dictionary nested python-itertools

Comparing lists with JSON and counting occurrences


Given lists as follows:

# Allowed values for each field of a record; None stands for "no info recorded".
make = ['ford', 'fiat', 'nissan', 'suzuki', 'dacia']
model = ['x', 'y', 'z']
version = ['A', 'B', 'C']
typ = ['sedan', 'coupe', 'van', 'kombi']
infos = ['steering wheel problems', 'gearbox problems', 'broken engine', 'throttle problems', None]

# Gather the value lists in field order so they can be unpacked into
# itertools.product(*total). Building the list directly also defines `total`,
# which the original append calls used without ever creating it.
total = [make, model, version, typ, infos]

I need to create a list of lists containing all possible combinations of these lists, so I did:

# Cartesian product of the value lists: one tuple per possible combination.
combos = [*itertools.product(*total)]
# The same combinations, each converted from a tuple to a list.
all_combos = list(map(list, combos))

Now I want to find the items in a JSON object whose values match an item of all_combos, and count how many times each such combination occurs. My JSON is large and looks a bit like:

# Illustrative sample of the input records; the trailing `...` is a
# placeholder for the remaining records.
data = [
{  'make': 'dacia',
   'model': 'x',
   'version': 'A',
   'typ': 'sedan',
   'infos': 'steering wheel problems'
}, ...]

I want to get output like:

# Desired result: every record carries an extra key holding how many records
# share its combination of field values. The trailing `...` is a placeholder
# for the remaining records.
output = [
    {  'make': 'dacia',
       'model': 'x',
       'version': 'A',
       'typ': 'sedan',
       'infos': 'steering wheel problems',
       'number_of_occurences_of_such_combination_of_fields_with__such_values': 75
    }, ...]

How can I solve this task?


Solution

  • If I understand you correctly, you want to add the key number_of_occurences_of_such_combination_of_fields_with__such_values to each dictionary in your data:

    from operator import itemgetter
    from itertools import product

    # Allowed values for each field, listed in the same order as the keys
    # that `i` (below) reads from a record.
    make = ["ford", "fiat", "nissan", "suzuki", "dacia"]
    model = ["x", "y", "z"]
    version = ["A", "B", "C"]
    typ = ["sedan", "coupe", "van", "kombi"]
    infos = [
        "steering wheel problems",
        "gearbox problems",
        "broken engine",
        "throttle problems",
        None,
    ]

    total = [make, model, version, typ, infos]

    # Small sample of records used to demonstrate the counting.
    data = [
        {
            "make": "dacia",
            "model": "x",
            "version": "A",
            "typ": "sedan",
            "infos": "steering wheel problems",
        },
        {
            "make": "dacia",
            "model": "x",
            "version": "A",
            "typ": "sedan",
            "infos": "steering wheel problems",
        },
        {
            "make": "ford",
            "model": "x",
            "version": "A",
            "typ": "sedan",
            "infos": "steering wheel problems",
        },
    ]

    # Extracts the five compared fields of a record as one tuple, the same
    # shape that product() yields, so the two can be compared with ==.
    i = itemgetter("make", "model", "version", "typ", "infos")

    # Map each allowed combination to the list of records that match it.
    # `product` is imported by name above, so it must be called directly:
    # `itertools.product` raises NameError because the module itself is never
    # imported.
    cnt = {}
    for c in product(*total):
        for d in data:
            if i(d) == c:
                cnt.setdefault(c, []).append(d)

    # Store each group's size on every record in that group. This mutates the
    # dictionaries in `data` in place; a record that matches no allowed
    # combination is never grouped and therefore gets no count key.
    for v in cnt.values():
        for d in v:
            d[
                "number_of_occurences_of_such_combination_of_fields_with__such_values"
            ] = len(v)

    print(data)
    

    Prints:

    [
        {
            "make": "dacia",
            "model": "x",
            "version": "A",
            "typ": "sedan",
            "infos": "steering wheel problems",
            "number_of_occurences_of_such_combination_of_fields_with__such_values": 2,
        },
        {
            "make": "dacia",
            "model": "x",
            "version": "A",
            "typ": "sedan",
            "infos": "steering wheel problems",
            "number_of_occurences_of_such_combination_of_fields_with__such_values": 2,
        },
        {
            "make": "ford",
            "model": "x",
            "version": "A",
            "typ": "sedan",
            "infos": "steering wheel problems",
            "number_of_occurences_of_such_combination_of_fields_with__such_values": 1,
        },
    ]
    

    Version 2: (without itertools.product):

    from collections import Counter
    from operator import itemgetter


    # Extracts the five compared fields of a record as one tuple.
    i = itemgetter("make", "model", "version", "typ", "infos")

    # First pass: tally how many records share each combination of values.
    # Counter replaces the manual `cnt.get(c, 0) + 1` bookkeeping.
    cnt = Counter(i(d) for d in data)

    # Second pass: annotate every record in place with its combination's tally.
    for d in data:
        d[
            "number_of_occurences_of_such_combination_of_fields_with__such_values"
        ] = cnt[i(d)]

    print(data)