I am trying to find the bigram frequencies of each sequence of sounds in a list of around 10,000 words. So far I'm able to get bigram frequencies, but the code is counting sequences of two words in the list, not sequences of two sounds within each word. Is there a way to indicate which units I want to count?
Here's my python code:
from collections import Counter
import pandas as pd  # was "import pandas from pd", which is a SyntaxError

# Read CMU.csv and keep its "Transcription" column for the steps below.
CMU_data = pd.read_csv("CMU.csv")  # opening the csv file
transcript = CMU_data["Transcription"]  # storing transcriptions column as a variable
def converter(x):
    """Return a pandas Series as a tuple of its values.

    Any argument that is not a ``pd.Series`` is handed back unchanged.
    """
    if not isinstance(x, pd.Series):
        return x
    return tuple(x.values)
# Run converter over every transcription and keep only the distinct values.
transcript2 = transcript.apply(converter).unique()
print(transcript2)

# Finding bigrams: every entry is paired with the entry that follows it and
# the two are concatenated. The entries are the items of transcript2, so the
# pairs are formed between neighbouring entries, not inside a single entry.
data = transcript2
bigrams = Counter(
    current + following
    for current, following in zip(data[:-1], data[1:])
)
for bigram, count in bigrams.most_common():
    print(bigram, '=', count)
Here's a sample of what the current output looks like (the hashes indicate word boundaries):
# P OY1 N T # # S L AE1 SH # = 1
# S L AE1 SH # # TH R IY1 D IY2 # = 1
# TH R IY1 D IY2 # # K OW1 L AH0 N # = 1
# K OW1 L AH0 N # # S EH1 M IY0 K OW1 L AH0 N # = 1
# S EH1 M IY0 K OW1 L AH0 N # # S EH1 M IH0 K OW2 L AH0 N # = 1
# S EH1 M IH0 K OW2 L AH0 N # # K W EH1 S CH AH0 N M AA1 R K # = 1
# K W EH1 S CH AH0 N M AA1 R K # # AH0 # = 1
# AH0 # # EY1 # = 1
# EY1 # # EY1 Z # = 1
# EY1 Z # # EY1 F AO1 R T UW1 W AH1 N T UW1 EY1 T # = 1
(...)
Here is a sample of what I'm inputting (after it is converted to an array):
['# P OY1 N T # ' '# S L AE1 SH # ' '# TH R IY1 D IY2 # ' ...
'# L EH1 F T B R EY1 S # ' '# OW1 P EH0 N B R EY1 S # '
'# K L OW1 Z B R EY1 S # ']
I would like to get output that looks something like this:
TH R = 70
IY1 D = 100
IY2 # = 100
# K = 500
OW1 L = 100
AH0 N # = 200
N # = 500
Here's one way to do it:
from nltk.util import ngrams
from collections import Counter
import pandas as pd
# Sample transcriptions: space-separated sounds wrapped in '#' boundary
# markers, each with a trailing space.
inp = [
    '# P OY1 N T # ',
    '# S L AE1 SH # ',
    '# TH R IY1 D IY2 # ',
    '# L EH1 F T B R EY1 S # ',
    '# OW1 P EH0 N B R EY1 S # ',
    '# K L OW1 Z B R EY1 S # ',
]
def tokenise(s):
    """Split a transcription string into sound tokens.

    The string is split on whitespace. The first two pieces are joined into
    one token and the last two pieces are joined into one token, so a leading
    and trailing '#' marker stays attached to its neighbouring sound, e.g.
    '# P OY1 N T # ' -> ['# P', 'OY1', 'N', 'T #'].

    s: the transcription string.
    Returns a list of token strings. Inputs with three or fewer pieces
    collapse to a single token; an empty or whitespace-only string gives [].
    """
    # split() with no argument ignores leading/trailing whitespace and
    # collapses runs of whitespace, so stray double spaces do not produce
    # empty tokens.
    toks = s.split()
    if not toks:
        return []
    # With three or fewer pieces the head and tail merges overlap, leaving a
    # single token; handling it here avoids an IndexError on short inputs.
    if len(toks) <= 3:
        return [' '.join(toks)]
    head = ' '.join(toks[:2])
    tail = ' '.join(toks[-2:])
    return [head, *toks[2:-2], tail]
def count_ngrams(tups,n):
    """Tabulate how often each n-gram occurs.

    tups: iterable of hashable n-grams (tuples) to count.
    n: n-gram order; accepted but not used by this function.
    Returns a DataFrame with columns 'bigram' and 'count', sorted by
    descending count and re-indexed from 0.
    """
    tally = Counter(tups)
    table = pd.DataFrame(tally.items(), columns=['bigram', 'count'])
    ranked = table.sort_values(by='count', ascending=False)
    return ranked.reset_index(drop=True)
def counts(inp,n,unit='sound'):
    """Count n-grams over a list of transcription strings.

    inp: list of transcription strings.
    n: n-gram order (2 for bigrams).
    unit: 'sound' builds n-grams from the tokens of each string separately
        (via tokenise), so no n-gram spans two strings; 'word' builds n-grams
        over the strings themselves.
    Returns the DataFrame produced by count_ngrams.
    Raises ValueError if unit is neither 'sound' nor 'word'.
    """
    if unit == 'sound':
        # One flat list of n-gram tuples across all strings.
        tups = [gram for s in inp for gram in ngrams(tokenise(s), n)]
    elif unit == 'word':
        tups = list(ngrams(inp, n))
    else:
        # Previously an unknown unit fell through and failed with an
        # UnboundLocalError on tups; report the real problem instead.
        raise ValueError(
            "unit must be 'sound' or 'word', got {!r}".format(unit)
        )
    return count_ngrams(tups, n)
Sound bigram counts
# Bigram (n=2) counts over the sound tokens inside each transcription.
counts(inp,2,unit='sound')
# bigram count
# 0 (EY1, S #) 3
# 1 (R, EY1) 3
# 2 (B, R) 3
# 3 (# P, OY1) 1
# 4 (T, B) 1
# 5 (OW1, Z) 1
# 6 (L, OW1) 1
# 7 (# K, L) 1
# 8 (N, B) 1
# 9 (EH0, N) 1
# 10 (P, EH0) 1
# 11 (# OW1, P) 1
# 12 (F, T) 1
# 13 (OY1, N) 1
# 14 (EH1, F) 1
# 15 (# L, EH1) 1
# 16 (D, IY2 #) 1
# 17 (IY1, D) 1
# 18 (R, IY1) 1
# 19 (# TH, R) 1
# 20 (AE1, SH #) 1
# 21 (L, AE1) 1
# 22 (# S, L) 1
# 23 (N, T #) 1
# 24 (Z, B) 1
Word bigram counts
# Bigram (n=2) counts where each whole transcription string is one unit.
counts(inp,2,unit='word')
# bigram count
# 0 (# P OY1 N T # , # S L AE1 SH # ) 1
# 1 (# S L AE1 SH # , # TH R IY1 D IY2 # ) 1
# 2 (# TH R IY1 D IY2 # , # L EH1 F T B R EY1 S # ) 1
# 3 (# L EH1 F T B R EY1 S # , # OW1 P EH0 N B R E... 1
# 4 (# OW1 P EH0 N B R EY1 S # , # K L OW1 Z B R E... 1