Search code examples
pythonpandasdataframefrequency

How to separate characters in a word list to find bigram frequency


I am trying to find the bigram frequencies of each sequence of sounds in a list of around 10,000 words. So far I'm able to get bigram frequencies, but the code is counting sequences of two words in the list, not sequences of two sounds within each word. Is there a way to specify the units that I want to count?

Here's my python code:

from collections import Counter
import pandas as pd                       # was `import pandas from pd`, which is a SyntaxError

# Load the CSV and keep only its "Transcription" column (a pandas Series)
# for the processing steps that follow.
CMU_data = pd.read_csv("CMU.csv")         #opening the csv file 
transcript = CMU_data["Transcription"]    #storing transcriptions column as a variable


def converter(x):
    """Return the values of a pandas Series as a tuple.

    Any argument that is not a ``pd.Series`` is handed back unchanged.
    """
    if not isinstance(x, pd.Series):
        return x
    return tuple(x.values)

# Run every entry through converter, then keep only the distinct values.
unique_entries = transcript.apply(converter).unique()
transcript2 = unique_entries
print(transcript2)


#finding bigrams

# Each element of `data` is one whole transcription string, so pairing every
# element with its successor counts adjacent *entries* of the list, not
# adjacent sounds inside an entry.
data = transcript2
adjacent_pairs = zip(data, data[1:])
bigrams = Counter(left + right for left, right in adjacent_pairs)
for bigram, count in bigrams.most_common():
    print(bigram, '=', count)

Here's a sample of what the current output looks like (the hashes indicate word boundaries):

# P OY1 N T # # S L AE1 SH #  = 1
# S L AE1 SH # # TH R IY1 D IY2 #  = 1
# TH R IY1 D IY2 # # K OW1 L AH0 N #  = 1
# K OW1 L AH0 N # # S EH1 M IY0 K OW1 L AH0 N #  = 1
# S EH1 M IY0 K OW1 L AH0 N # # S EH1 M IH0 K OW2 L AH0 N #  = 1
# S EH1 M IH0 K OW2 L AH0 N # # K W EH1 S CH AH0 N M AA1 R K #  = 1
# K W EH1 S CH AH0 N M AA1 R K # # AH0 #  = 1
# AH0 # # EY1 #  = 1
# EY1 # # EY1 Z #  = 1
# EY1 Z # # EY1 F AO1 R T UW1 W AH1 N T UW1 EY1 T #  = 1
(...)

Here is a sample of what I'm inputting (after it is converted to an array):

['# P OY1 N T # ' '# S L AE1 SH # ' '# TH R IY1 D IY2 # ' ...
 '# L EH1 F T B R EY1 S # ' '# OW1 P EH0 N B R EY1 S # '
 '# K L OW1 Z B R EY1 S # ']

I would like to get output that looks something like this:

TH R = 70
IY1 D = 100
IY2 # = 100 
# K  = 500
OW1 L = 100
AH0 N #  = 200
N # = 500

Solution

  • Here's one way to do it:

    from nltk.util import ngrams 
    from collections import Counter
    import pandas as pd
    
    
    # Sample transcriptions used by the examples below: space-separated
    # sound symbols with a '#' at each end of the word. Every string also
    # carries a trailing space.
    inp = ['# P OY1 N T # ', '# S L AE1 SH # ', '# TH R IY1 D IY2 # ',
           '# L EH1 F T B R EY1 S # ', '# OW1 P EH0 N B R EY1 S # ',
           '# K L OW1 Z B R EY1 S # ']
    
    def tokenise(s):
        """Split one transcription into tokens, fusing the edge symbols.

        The first two whitespace-separated symbols are joined into a single
        token, and so are the last two, e.g. ``'# P OY1 N T # '`` becomes
        ``['# P', 'OY1', 'N', 'T #']``.

        Splitting is done with ``str.split()`` (no argument), so leading,
        trailing or repeated whitespace never produces empty tokens.

        Short inputs no longer raise ``IndexError``:
        an empty or whitespace-only string gives ``[]``, and a string with
        one to three symbols collapses into a single token
        (``'# AH0 # '`` -> ``['# AH0 #']``).

        Args:
            s: A transcription string of space-separated symbols.

        Returns:
            A new list of token strings.
        """
        toks = s.split()
        if len(toks) <= 3:
            # The leading pair and the trailing pair overlap (or do not
            # exist), so everything present fuses into at most one token.
            return [' '.join(toks)] if toks else []
        # Join starting symbol with second element, and penultimate element
        # with the final symbol; the symbols in between are kept as they are.
        return [' '.join(toks[:2])] + toks[2:-2] + [' '.join(toks[-2:])]
    
    def count_ngrams(tups,n):
        """Tabulate how often each item of ``tups`` occurs.

        Args:
            tups: Iterable of hashable items (n-gram tuples in this file).
            n: Accepted for interface symmetry but not used here; the first
                column is always labelled ``'bigram'``.

        Returns:
            A DataFrame with columns ``'bigram'`` and ``'count'``, sorted by
            ``'count'`` in descending order, with a fresh 0-based index.
        """
        tallies = Counter(tups)
        table = pd.DataFrame(tallies.items(), columns=['bigram', 'count'])
        ranked = table.sort_values(by='count', ascending=False)
        return ranked.reset_index(drop=True)
    
    def counts(inp,n,unit='sound'):
        """Count n-grams over ``inp`` at the sound level or the word level.

        Args:
            inp: Sequence of transcription strings.
            n: Size of the n-grams to build.
            unit: ``'sound'`` to tokenise each string with ``tokenise`` and
                build n-grams inside every string separately, or ``'word'``
                to treat each whole string as one unit and build n-grams
                across the sequence.

        Returns:
            Whatever ``count_ngrams`` returns for the collected n-grams.

        Raises:
            ValueError: If ``unit`` is neither ``'sound'`` nor ``'word'``.
                Previously this case fell through with ``tups`` unassigned
                and failed with an UnboundLocalError.
        """
        # Validate first so a bad unit fails with a clear message.
        if unit not in ('sound', 'word'):
            raise ValueError(
                "unit must be 'sound' or 'word', got {!r}".format(unit)
            )

        if unit == 'sound':
            # n-grams never span two strings: each string is tokenised and
            # windowed on its own, and the results are concatenated.
            tups = [gram for s in inp for gram in ngrams(tokenise(s), n)]
        else:
            tups = list(ngrams(inp, n))

        return count_ngrams(tups, n)
    

    Sound bigram counts

    counts(inp,2,unit='sound')
    
    #          bigram  count
    # 0    (EY1, S #)      3
    # 1      (R, EY1)      3
    # 2        (B, R)      3
    # 3    (# P, OY1)      1
    # 4        (T, B)      1
    # 5      (OW1, Z)      1
    # 6      (L, OW1)      1
    # 7      (# K, L)      1
    # 8        (N, B)      1
    # 9      (EH0, N)      1
    # 10     (P, EH0)      1
    # 11   (# OW1, P)      1
    # 12       (F, T)      1
    # 13     (OY1, N)      1
    # 14     (EH1, F)      1
    # 15   (# L, EH1)      1
    # 16   (D, IY2 #)      1
    # 17     (IY1, D)      1
    # 18     (R, IY1)      1
    # 19    (# TH, R)      1
    # 20  (AE1, SH #)      1
    # 21     (L, AE1)      1
    # 22     (# S, L)      1
    # 23     (N, T #)      1
    # 24       (Z, B)      1
    
    

    Word bigram counts

    counts(inp,2,unit='word')
    
    #                                               bigram  count
    # 0                  (# P OY1 N T # , # S L AE1 SH # )      1
    # 1             (# S L AE1 SH # , # TH R IY1 D IY2 # )      1
    # 2    (# TH R IY1 D IY2 # , # L EH1 F T B R EY1 S # )      1
    # 3  (# L EH1 F T B R EY1 S # , # OW1 P EH0 N B R E...      1
    # 4  (# OW1 P EH0 N B R EY1 S # , # K L OW1 Z B R E...      1