How to separate characters in a word list to find bigram frequency

I am trying to find the bigram frequencies of the sound sequences in a list of around 10,000 words. So far I'm able to get bigram frequencies, but the code counts pairs of adjacent words in the list, not pairs of adjacent sounds within each word. Is there a way to indicate which units I want to count?

Here's my Python code:

from collections import Counter

import pandas as pd

# Load the word list from the CSV file.
CMU_data = pd.read_csv("CMU.csv")
# Keep only the "Transcription" column; everything below works on it.
transcript = CMU_data["Transcription"]


def converter(x):
    """Turn a pandas Series into a tuple of its values.

    Any argument that is not a ``pd.Series`` is returned unchanged.
    """
    return tuple(x.values) if isinstance(x, pd.Series) else x

# Pass every entry through converter(), then keep each distinct value once.
transcript2 = transcript.apply(converter).unique()
print(transcript2)

# Finding bigrams: pair each entry of `data` with the entry that follows it
# and count the concatenated pairs.  The entries of `data` are the items being
# paired, so the counting unit is whatever one entry holds.
data = transcript2
bigrams = Counter(first + second for first, second in zip(data, data[1:]))
for bigram, count in bigrams.most_common():
    print(bigram, '=', count)

Here's a sample of what the current output looks like (the hashes indicate word boundaries):

# P OY1 N T # # S L AE1 SH #  = 1
# S L AE1 SH # # TH R IY1 D IY2 #  = 1
# TH R IY1 D IY2 # # K OW1 L AH0 N #  = 1
# K OW1 L AH0 N # # S EH1 M IY0 K OW1 L AH0 N #  = 1
# S EH1 M IY0 K OW1 L AH0 N # # S EH1 M IH0 K OW2 L AH0 N #  = 1
# S EH1 M IH0 K OW2 L AH0 N # # K W EH1 S CH AH0 N M AA1 R K #  = 1
# K W EH1 S CH AH0 N M AA1 R K # # AH0 #  = 1
# AH0 # # EY1 #  = 1
# EY1 # # EY1 Z #  = 1
# EY1 Z # # EY1 F AO1 R T UW1 W AH1 N T UW1 EY1 T #  = 1
(...)

Here is a sample of what I'm inputting (after it is converted to an array):

['# P OY1 N T # ' '# S L AE1 SH # ' '# TH R IY1 D IY2 # ' ...
 '# L EH1 F T B R EY1 S # ' '# OW1 P EH0 N B R EY1 S # '
 '# K L OW1 Z B R EY1 S # ']

I would like to get output that looks something like this:

TH R = 70
IY1 D = 100
IY2 # = 100 
# K  = 500
OW1 L = 100
AH0 N #  = 200
N # = 500

Solution

Here's one way to do it:

from nltk.util import ngrams 
from collections import Counter
import pandas as pd


# Sample input: one string per word, symbols separated by single spaces.
inp = [
    '# P OY1 N T # ',
    '# S L AE1 SH # ',
    '# TH R IY1 D IY2 # ',
    '# L EH1 F T B R EY1 S # ',
    '# OW1 P EH0 N B R EY1 S # ',
    '# K L OW1 Z B R EY1 S # ',
]

def tokenise(s):
    """Split a transcription into tokens, merging the edge symbols.

    The string is split on whitespace, then the first two symbols are joined
    into one token and the last two symbols are joined into one token.  For
    input shaped like ``'# P OY1 N T # '`` this attaches each ``#`` to its
    neighbouring sound and gives ``['# P', 'OY1', 'N', 'T #']``.

    Args:
        s: One transcription string with whitespace-separated symbols.

    Returns:
        A list of token strings.  Input with fewer than two symbols
        (including the empty string) is returned without merging rather
        than raising ``IndexError``.
    """
    # split() with no argument ignores leading/trailing whitespace and never
    # produces empty tokens when symbols are separated by several spaces.
    toks = s.split()
    # Join starting # with second element
    if len(toks) > 1:
        toks[:2] = [' '.join(toks[:2])]
    # Join penultimate element with end # (skipped if only one token is left)
    if len(toks) > 1:
        toks[-2:] = [' '.join(toks[-2:])]
    return toks

def count_ngrams(tups,n):
    """Tabulate how often each item of ``tups`` occurs.

    Args:
        tups: Iterable of hashable items (n-gram tuples in this script).
        n: Accepted for the caller's convenience; not used by this function.

    Returns:
        A DataFrame with columns ``'bigram'`` and ``'count'``, sorted by
        ``'count'`` in descending order and re-indexed from 0.
    """
    tally = Counter(tups)
    table = pd.DataFrame(tally.items(), columns=['bigram', 'count'])
    ranked = table.sort_values(by='count', ascending=False)
    return ranked.reset_index(drop=True)

def counts(inp,n,unit='sound'):
    """Count n-grams over sounds within words, or over whole words.

    Args:
        inp: Sequence of transcription strings, one per word.
        n: Size of the n-grams to build.
        unit: ``'sound'`` tokenises each string with ``tokenise`` and builds
            n-grams inside each word separately; ``'word'`` builds n-grams
            over the entries of ``inp`` themselves.

    Returns:
        The table produced by ``count_ngrams`` for the collected n-grams.

    Raises:
        ValueError: If ``unit`` is neither ``'sound'`` nor ``'word'``.
    """
    if unit == 'sound':
        # n-grams are generated per word and concatenated in input order, so
        # no n-gram spans two different words.
        tups = [gram for s in inp for gram in ngrams(tokenise(s), n)]
    elif unit == 'word':
        tups = list(ngrams(inp, n))
    else:
        # Previously an unknown unit left `tups` unassigned and failed with
        # an UnboundLocalError at the return statement.
        raise ValueError(
            "unit must be 'sound' or 'word', got {!r}".format(unit)
        )

    return count_ngrams(tups, n)

Sound bigram counts

# Bigrams (n=2) over the sound tokens inside each word; sample result below.
counts(inp,2,unit='sound')

#          bigram  count
# 0    (EY1, S #)      3
# 1      (R, EY1)      3
# 2        (B, R)      3
# 3    (# P, OY1)      1
# 4        (T, B)      1
# 5      (OW1, Z)      1
# 6      (L, OW1)      1
# 7      (# K, L)      1
# 8        (N, B)      1
# 9      (EH0, N)      1
# 10     (P, EH0)      1
# 11   (# OW1, P)      1
# 12       (F, T)      1
# 13     (OY1, N)      1
# 14     (EH1, F)      1
# 15   (# L, EH1)      1
# 16   (D, IY2 #)      1
# 17     (IY1, D)      1
# 18     (R, IY1)      1
# 19    (# TH, R)      1
# 20  (AE1, SH #)      1
# 21     (L, AE1)      1
# 22     (# S, L)      1
# 23     (N, T #)      1
# 24       (Z, B)      1

Word bigram counts

# Bigrams (n=2) over adjacent whole entries of inp; sample result below.
counts(inp,2,unit='word')

#                                               bigram  count
# 0                  (# P OY1 N T # , # S L AE1 SH # )      1
# 1             (# S L AE1 SH # , # TH R IY1 D IY2 # )      1
# 2    (# TH R IY1 D IY2 # , # L EH1 F T B R EY1 S # )      1
# 3  (# L EH1 F T B R EY1 S # , # OW1 P EH0 N B R E...      1
# 4  (# OW1 P EH0 N B R EY1 S # , # K L OW1 Z B R E...      1