Tags: python, pandas, data-analysis, defaultdict, bigdata

Aggregating values in one column by their corresponding value in another from two files


I have a question about summing the multiple values of duplicate keys into a single key with the aggregate total. For example, given: 1:5 2:4 3:2 1:4. Very basic, but the output I'm looking for is: 1:9 2:4 3:2
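The core operation is just summing values that share a key. Independent of the files, a minimal sketch of that aggregation using collections.Counter from the standard library (the pairs list is simply the example above) looks like:

from collections import Counter

# the example pairs from above; key 1 appears twice
pairs = [(1, 5), (2, 4), (3, 2), (1, 4)]

totals = Counter()
for key, value in pairs:
    totals[key] += value   # duplicate keys accumulate into one total

print(dict(totals))        # {1: 9, 2: 4, 3: 2}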

In the two files I am using, I am dealing with a list of 51 users (column 1 of user_artists.dat), the artistID each user has listened to (column 2), and how many times that user has listened to that particular artist, given by the weight (column 3).

I am attempting to aggregate the total number of times each artist has been played across all users and display it in a format such as: Britney Spears (289) 2393140. Any help or input would be appreciated.
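For reference, both files are assumed here to be tab-separated with a header row (which is why the code below skips the first line); the rows are made-up illustrations, apart from the Britney Spears ID taken from the example above:

user_artists.dat
    userID  artistID  weight
    2       289       137
    3       289       2100
    3       52        356

artists.dat
    id      name            url   pictureURL
    289     Britney Spears  ...   ...
    52      ...             ...   ...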

import codecs
#from collections import defaultdict

with codecs.open("artists.dat", encoding = "utf-8") as f:
    artists = f.readlines()


with codecs.open("user_artists.dat", encoding = "utf-8") as f:
    users = f.readlines()


artist_list = [x.strip().split('\t') for x in artists][1:]
user_stats_list = [x.strip().split('\t') for x in users][1:]

artists = {}
for a in artist_list:
    artistID, name = a[0], a[1]
    artists[artistID] = name

grouped_user_stats = {}
for u in user_stats_list:
    userID, artistID, weight = u
    grouped_user_stats[artistID] = grouped_user_stats[artistID].astype(int)
    grouped_user_stats[weight] = grouped_user_stats[weight].astype(int)
    for artistID, weight in u:
        grouped_user_stats.groupby('artistID')['weight'].sum()
        print(grouped_user_stats.groupby('artistID')['weight'].sum())



    #if userID not in grouped_user_stats:
        #grouped_user_stats[userID] = { artistID: {'name': artists[artistID], 'plays': 1} }
    #else:
        #if artistID not in grouped_user_stats[userID]:
            #grouped_user_stats[userID][artistID] = {'name': artists[artistID], 'plays': 1}
        #else:
            #grouped_user_stats[userID][artistID]['plays'] += 1
            #print('this never happens') 




#print(grouped_user_stats)

Solution

  • how about:

    import codecs
    from collections import defaultdict
    # read stuff
    with codecs.open("artists.dat", encoding = "utf-8") as f:
        artists = f.readlines()
    with codecs.open("user_artists.dat", encoding = "utf-8") as f:
        users = f.readlines()
    # transform the artist data into a dict with "artist id" as key and "artist name" as value
    artist_repo = dict(x.strip().split('\t')[:2] for x in artists[1:])
    
    user_stats_list = [x.strip().split('\t') for x in users][1:]
    
    grouped_user_stats = defaultdict(int)  # missing artist IDs start with a total of 0
    
    for u in user_stats_list:
        #userID, artistID, weight = u
        grouped_user_stats[u[1]] += int(u[2]) # accumulate weights in a dict with artist id as key and sum of weights as value
    # extra: "fancying" the data by rewriting the keys of the dict in "<artist name> (artist id)" format
    grouped_user_stats = dict(("%s (%s)" % (artist_repo.get(k, "Unknown artist"), k), v) for k, v in grouped_user_stats.items())
    # lastly print it
    for k, v in grouped_user_stats.items():
        print(k, v)
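
If a ranked listing is wanted (most-played artists first), one possible follow-up, reusing the grouped_user_stats dict built above, is to sort by the accumulated total before printing:

    # optional: print most-played artists first
    for k, v in sorted(grouped_user_stats.items(), key=lambda kv: kv[1], reverse=True):
        print(k, v)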