Tags: python, optimization, dictionary, profiling, zip

Optimization of Function with Dictionary and zip()


I have the following function:

import os
import json
import re

def filetxt():
    word_freq = {}
    lvl1      = []
    lvl2      = []
    total_t   = 0
    users     = 0
    text      = []

    for l in range(0,500):
        # Open File
        if os.path.exists("C:/Twitter/json/user_" + str(l) + ".json") == True:
            with open("C:/Twitter/json/user_" + str(l) + ".json", "r") as f:
                text_f = json.load(f)
                users = users + 1
                for i in range(len(text_f)):
                    text.append(text_f[str(i)]['text'])
                    total_t = total_t + 1
        else:
            pass

    # Filter
    occ = 0
    import string
    for i in range(len(text)):
        s = text[i] # Sample string
        a = re.findall(r'(RT)',s)
        b = re.findall(r'(@)',s)
        occ = len(a) + len(b) + occ
        s = s.encode('utf-8')
        out = s.translate(string.maketrans("",""), string.punctuation)


        # Create Wordlist/Dictionary
        word_list = text[i].lower().split(None)

        for word in word_list:
            word_freq[word] = word_freq.get(word, 0) + 1

        keys = word_freq.keys()

        numbo = range(1,len(keys)+1)
        WList = ', '.join(keys)
        NList = str(numbo).strip('[]')
        WList = WList.split(", ")
        NList = NList.split(", ")
        W2N = dict(zip(WList, NList))

        for k in range(0, len(word_list)):
            word_list[k] = W2N[word_list[k]]
        for i in range(0, len(word_list) - 1):
            lvl1.append(word_list[i])
            lvl2.append(word_list[i+1])

I used a profiler and found that the greatest CPU time seems to be spent on the zip() function and on the join and split parts of the code. Is there anything I have overlooked that could clean up the code and make it faster? The greatest lag seems to be in how I am working with the dictionaries and with zip(). Any help would be appreciated, thanks!
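
For reference, a minimal sketch of that kind of measurement with the standard-library cProfile module (the exact invocation below is illustrative, not from the original post):

    import cProfile
    import pstats

    # Profile one run of filetxt() and show the 10 most expensive calls
    # sorted by cumulative time.
    cProfile.run('filetxt()', 'filetxt.prof')
    pstats.Stats('filetxt.prof').sort_stats('cumulative').print_stats(10)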

P.S. The basic purpose of this function is that I load in files which each contain 20 or so tweets, so I will most likely end up with about 20k - 50k files being sent through this function. The output is a list of all the distinct words in the tweets, followed by which words are linked to which, e.g.:

1 "love"
2 "pasa"
3 "mirar"
4 "ants"
5 "kers"
6 "morir"
7 "dreaming"
8 "tan"
9 "rapido"
10 "one"
11 "much"
12 "la"
...
10 1
13 12
1 7
12 2
7 3
2 4
3 11
4 8
11 6
8 9
6 5
9 20
5 8
20 25
8 18
25 9
18 17
9 2
...
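
For reference, the word-to-number mapping behind this output can be built incrementally with a plain dict, rather than being rebuilt with join/split/zip() on every pass; a minimal sketch (word_ids and get_id are hypothetical names, not from the original code):

    word_ids = {}  # word -> 1-based id, in order of first appearance

    def get_id(word):
        # setdefault inserts only when the word is new, so existing ids are stable
        return word_ids.setdefault(word, len(word_ids) + 1)

    words = "love pasa mirar love".split()
    pairs = [(get_id(a), get_id(b)) for a, b in zip(words, words[1:])]
    # word_ids == {'love': 1, 'pasa': 2, 'mirar': 3}
    # pairs    == [(1, 2), (2, 3), (3, 1)]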

Solution

  • I think you want something like:

    import json
    import string
    from collections import defaultdict

    try:
        rng = xrange   # Python 2
    except NameError:
        rng = range    # Python 3
    
    def filetxt():
        users     = 0
        total_t   = 0
        occ       = 0
    
        wordcount = defaultdict(int)
        wordpairs = defaultdict(lambda: defaultdict(int))
        for filenum in rng(500):
            try:
                with open("C:/Twitter/json/user_" + str(filenum) + ".json",'r') as inf:
                    users += 1
                    tweets = json.load(inf)
                    total_t += len(tweets)
    
                    # the JSON object is keyed by stringified indices,
                    # so iterate over its values rather than its keys
                    for txt in (t['text'] for t in tweets.values()):
                        occ += txt.count('RT') + txt.count('@')
                        prev = None  # None marks the start of a tweet
                        # Python 2 str.translate(None, chars) deletes punctuation
                        for word in txt.encode('utf-8').translate(None, string.punctuation).lower().split():
                            wordcount[word] += 1
                            wordpairs[prev][word] += 1  # count (previous word, word) links
                            prev = word
            except IOError:
                # missing files are simply skipped, replacing the os.path.exists check
                pass
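
    This counts word and word-pair frequencies; it does not emit the numbered lists from the question. A minimal sketch of printing that output from the wordcount and wordpairs structures above (the id-assignment step is an assumption, not part of the original answer):

        word_ids = {}
        for word in wordcount:
            word_ids.setdefault(word, len(word_ids) + 1)

        # numbered list of distinct words, in id order
        for word, num in sorted(word_ids.items(), key=lambda kv: kv[1]):
            print('%d "%s"' % (num, word))

        # word links: one line per observed (previous word, next word) pair
        for prev, following in wordpairs.items():
            if prev is None:
                continue  # None marks the start of a tweet, not a real word
            for word in following:
                print('%d %d' % (word_ids[prev], word_ids[word]))

    The nested defaultdict lets wordpairs[prev][word] += 1 run without any key checks, which is what replaces the per-tweet join/split/zip() rebuilding in the original function.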