1. Installing Libraries
In [ ]:
!pip install pyenchant contractions g2p-en cmudict jiwer
import nltk
!pip install -U nltk
nltk.download('averaged_perceptron_tagger')
nltk.download('punkt')
nltk.download('cmudict')
!pip install enchant
Requirement already satisfied: pyenchant in /usr/local/lib/python3.10/dist-packages (3.2.2) Requirement already satisfied: contractions in /usr/local/lib/python3.10/dist-packages (0.1.73) Requirement already satisfied: g2p-en in /usr/local/lib/python3.10/dist-packages (2.1.0) Requirement already satisfied: cmudict in /usr/local/lib/python3.10/dist-packages (1.0.13) Requirement already satisfied: jiwer in /usr/local/lib/python3.10/dist-packages (3.0.3) Requirement already satisfied: textsearch>=0.0.21 in /usr/local/lib/python3.10/dist-packages (from contractions) (0.0.24) Requirement already satisfied: numpy>=1.13.1 in /usr/local/lib/python3.10/dist-packages (from g2p-en) (1.23.5) Requirement already satisfied: nltk>=3.2.4 in /usr/local/lib/python3.10/dist-packages (from g2p-en) (3.8.1) Requirement already satisfied: inflect>=0.3.1 in /usr/local/lib/python3.10/dist-packages (from g2p-en) (7.0.0) Requirement already satisfied: distance>=0.1.3 in /usr/local/lib/python3.10/dist-packages (from g2p-en) (0.1.3) Requirement already satisfied: importlib-metadata<6.0.0,>=5.1.0 in /usr/local/lib/python3.10/dist-packages (from cmudict) (5.2.0) Requirement already satisfied: importlib-resources<6.0.0,>=5.10.1 in /usr/local/lib/python3.10/dist-packages (from cmudict) (5.13.0) Requirement already satisfied: click<9.0.0,>=8.1.3 in /usr/local/lib/python3.10/dist-packages (from jiwer) (8.1.7) Requirement already satisfied: rapidfuzz<4,>=3 in /usr/local/lib/python3.10/dist-packages (from jiwer) (3.3.0) Requirement already satisfied: zipp>=0.5 in /usr/local/lib/python3.10/dist-packages (from importlib-metadata<6.0.0,>=5.1.0->cmudict) (3.16.2) Requirement already satisfied: pydantic>=1.9.1 in /usr/local/lib/python3.10/dist-packages (from inflect>=0.3.1->g2p-en) (1.10.12) Requirement already satisfied: typing-extensions in /usr/local/lib/python3.10/dist-packages (from inflect>=0.3.1->g2p-en) (4.5.0) Requirement already satisfied: joblib in /usr/local/lib/python3.10/dist-packages (from nltk>=3.2.4->g2p-en) (1.3.2) Requirement already satisfied: regex>=2021.8.3 in /usr/local/lib/python3.10/dist-packages (from nltk>=3.2.4->g2p-en) (2023.6.3) Requirement already satisfied: tqdm in /usr/local/lib/python3.10/dist-packages (from nltk>=3.2.4->g2p-en) (4.66.1) Requirement already satisfied: anyascii in /usr/local/lib/python3.10/dist-packages (from textsearch>=0.0.21->contractions) (0.3.2) Requirement already satisfied: pyahocorasick in /usr/local/lib/python3.10/dist-packages (from textsearch>=0.0.21->contractions) (2.0.0) Requirement already satisfied: nltk in /usr/local/lib/python3.10/dist-packages (3.8.1) Requirement already satisfied: click in /usr/local/lib/python3.10/dist-packages (from nltk) (8.1.7) Requirement already satisfied: joblib in /usr/local/lib/python3.10/dist-packages (from nltk) (1.3.2) Requirement already satisfied: regex>=2021.8.3 in /usr/local/lib/python3.10/dist-packages (from nltk) (2023.6.3) Requirement already satisfied: tqdm in /usr/local/lib/python3.10/dist-packages (from nltk) (4.66.1)
[nltk_data] Downloading package averaged_perceptron_tagger to [nltk_data] /root/nltk_data... [nltk_data] Package averaged_perceptron_tagger is already up-to- [nltk_data] date! [nltk_data] Downloading package punkt to /root/nltk_data... [nltk_data] Package punkt is already up-to-date! [nltk_data] Downloading package cmudict to /root/nltk_data... [nltk_data] Package cmudict is already up-to-date!
Requirement already satisfied: enchant in /usr/local/lib/python3.10/dist-packages (0.0.1)
In [ ]:
from google.colab import drive
drive.mount('/content/drive')
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
In [ ]:
from pathlib import Path
import re
import pandas as pd
import numpy as np
import random
import string
from collections import Counter
import matplotlib.pyplot as plt
from collections import defaultdict
from wordcloud import WordCloud
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import cmudict
import enchant
from g2p_en import G2p
import inflect
import contractions
import seaborn as sns
from re import match
from multiprocessing import Pool
from nltk.corpus import cmudict
import contractions
from itertools import chain
from IPython.display import display
import tensorflow as tf
from tensorflow.keras.layers import Input, Embedding, GRU, Dense, Dropout, Attention
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.utils import plot_model
import jiwer
from nltk.translate.bleu_score import corpus_bleu, SmoothingFunction
In [ ]:
# Define the file path
path = Path("/content/drive/MyDrive/Dissertation/g_train.txt")
# Read the file content
with open(path, 'r', encoding='utf-8') as file:
lines = file.readlines()
# Number of lines in the dataset
num_lines = len(lines)
print(f"Number of lines in the dataset are: {num_lines}")
# Display first few lines
print("\nThe first 5 lines are:")
for line in lines[:5]:
print(line.strip())
# Total number of words in the dataset
total_words = sum(len(line.split()) for line in lines)
print(f"\nThe Total number of words are: {total_words}")
# Average number of words per line
avg_words_per_line = total_words / num_lines
print(f"The average number of words per line are: {avg_words_per_line:.2f}")
Number of lines in the dataset are: 45839 The first 5 lines are: 5535415699068794046/00001, WHEN YOU'RE COOKING CHIPS AT HOME 5535415699068794046/00002, THE TRADITIONAL CHIP PAN OFTEN STAYS ON THE SHELF 5535415699068794046/00003, THROUGH WHAT THEY CALL A KNIFE BLOCK 5535415699068794046/00004, WHICH INVOLVES FIRING A POTATO DOWN A PIPE 5535415699068794046/00006, APART FROM THE GOLDEN COLOUR AND THE DELICIOUS FLAVOUR The Total number of words are: 375019 The average number of words per line are: 8.18
In [ ]:
# Create inflect engine
p = inflect.engine()
# Function to convert numbers to words
def convert_numbers_to_words(text):
words_with_numbers = []
words = text.split()
for i, word in enumerate(words):
if word.isdigit():
words_with_numbers.append((word, p.number_to_words(word)))
words[i] = words_with_numbers[-1][1]
return ' '.join(words), words_with_numbers
# Function to replace hyphens with spaces
def replace_hyphens_with_spaces(text):
return text.replace('-', ' ')
lines = []
numbers_converted = []
with open(path, encoding='utf-8') as file:
for line in file.readlines():
cleaned_line, numbers = convert_numbers_to_words(re.sub(r'^\d+/\d+,\s*', '', line).strip())
lines.append(cleaned_line)
numbers_converted.extend(numbers)
# Convert to DataFrame and process
df = pd.DataFrame(lines, columns=['sentence'])
df['sentence'] = df['sentence'].apply(str.lstrip)
df['sentence'] = df['sentence'].apply(replace_hyphens_with_spaces)
# Print converted numbers
print("\nNumbers converted to words:")
for number, words in numbers_converted:
print(f"{number}: {words.replace('-', ' ')}")
# Additional insights
print("\nSome additional insights:")
print("Average sentence length:", df['sentence'].str.split().apply(len).mean())
print("Max sentence length:", df['sentence'].str.split().apply(len).max())
print("Min sentence length:", df['sentence'].str.split().apply(len).min())
print("Unique words:", len(set(' '.join(df['sentence']).split())))
print("\nShape:", df.shape)
# Print DataFrame
print(df.head())
Numbers converted to words:
4: four
24: twenty four
1972: one thousand, nine hundred and seventy two
1: one
20: twenty
2: two
2012: two thousand and twelve
110: one hundred and ten
6: six
14: fourteen
31: thirty one
1964: one thousand, nine hundred and sixty four
1965: one thousand, nine hundred and sixty five
69: sixty nine
1: one
1966: one thousand, nine hundred and sixty six
67: sixty seven
1: one
10: ten
230: two hundred and thirty
1968: one thousand, nine hundred and sixty eight
1969: one thousand, nine hundred and sixty nine
80: eighty
17: seventeen
50: fifty
60: sixty
200: two hundred
180: one hundred and eighty
10: ten
20: twenty
195: one hundred and ninety five
249: two hundred and forty nine
300: three hundred
500: five hundred
350: three hundred and fifty
25: twenty five
20: twenty
65: sixty five
10: ten
2005: two thousand and five
000: zero
50: fifty
1: one
40: forty
60: sixty
23: twenty three
5: five
10: ten
10: ten
000: zero
180: one hundred and eighty
54: fifty four
1: one
300: three hundred
45: forty five
50: fifty
5: five
20: twenty
80: eighty
10: ten
100: one hundred
3: three
10: ten
21: twenty one
21: twenty one
400: four hundred
5: five
40: forty
35: thirty five
5: five
1: one
15: fifteen
250: two hundred and fifty
15: fifteen
25: twenty five
27: twenty seven
60: sixty
4: four
500: five hundred
6: six
6: six
20: twenty
68: sixty eight
7: seven
24: twenty four
2: two
3: three
2000: two thousand
40: forty
39: thirty nine
1940: one thousand, nine hundred and forty
75: seventy five
000: zero
100: one hundred
1: one
800: eight hundred
108: one hundred and eight
012: twelve
2012: two thousand and twelve
20: twenty
10: ten
85: eighty five
31: thirty one
300: three hundred
9: nine
3: three
150: one hundred and fifty
20: twenty
500: five hundred
000: zero
100: one hundred
1734: one thousand, seven hundred and thirty four
147: one hundred and forty seven
85: eighty five
55: fifty five
14: fourteen
15: fifteen
80: eighty
000: zero
90: ninety
200: two hundred
25: twenty five
300: three hundred
75: seventy five
10: ten
5: five
22: twenty two
150: one hundred and fifty
45: forty five
2001: two thousand and one
10: ten
6: six
300: three hundred
12: twelve
185: one hundred and eighty five
80: eighty
14: fourteen
30: thirty
40: forty
29: twenty nine
14: fourteen
300: three hundred
1: one
810: eight hundred and ten
18: eighteen
18: eighteen
40: forty
60: sixty
35: thirty five
000: zero
000: zero
9: nine
15: fifteen
17: seventeen
12: twelve
25: twenty five
30: thirty
21: twenty one
300: three hundred
10: ten
2009: two thousand and nine
45: forty five
30: thirty
59: fifty nine
7: seven
24: twenty four
5: five
000: zero
30: thirty
86: eighty six
32: thirty two
10: ten
5: five
70: seventy
100: one hundred
50: fifty
28: twenty eight
65: sixty five
45: forty five
30: thirty
40: forty
000: zero
1642: one thousand, six hundred and forty two
65: sixty five
12: twelve
1527: one thousand, five hundred and twenty seven
150: one hundred and fifty
250: two hundred and fifty
60: sixty
1900: one thousand, nine hundred
140: one hundred and forty
10: ten
12: twelve
13: thirteen
35: thirty five
12: twelve
40: forty
50: fifty
50: fifty
50: fifty
9: nine
5: five
1: one
125: one hundred and twenty five
5: five
28: twenty eight
24: twenty four
000: zero
18: eighteen
78: seventy eight
100: one hundred
10: ten
100: one hundred
18: eighteen
25: twenty five
1979: one thousand, nine hundred and seventy nine
33: thirty three
100: one hundred
100: one hundred
50: fifty
80: eighty
20: twenty
30: thirty
1: one
1709: one thousand, seven hundred and nine
1710: one thousand, seven hundred and ten
30: thirty
50: fifty
400: four hundred
218: two hundred and eighteen
97: ninety seven
24: twenty four
28: twenty eight
4: four
520: five hundred and twenty
32: thirty two
2013: two thousand and thirteen
2013: two thousand and thirteen
10: ten
160: one hundred and sixty
250: two hundred and fifty
2012: two thousand and twelve
78: seventy eight
2010: two thousand and ten
1980: one thousand, nine hundred and eighty
600: six hundred
500: five hundred
1: one
000: zero
30: thirty
400: four hundred
10: ten
200: two hundred
27: twenty seven
5: five
14: fourteen
90: ninety
25: twenty five
20: twenty
41: forty one
25: twenty five
100: one hundred
150: one hundred and fifty
20: twenty
30: thirty
25: twenty five
60: sixty
41: forty one
40: forty
47: forty seven
20: twenty
50: fifty
60: sixty
50: fifty
400: four hundred
1: one
20: twenty
140: one hundred and forty
1: one
80: eighty
20: twenty
95: ninety five
18: eighteen
500: five hundred
40: forty
50: fifty
500: five hundred
1: one
2015: two thousand and fifteen
100: one hundred
2: two
190: one hundred and ninety
400: four hundred
5: five
10: ten
50: fifty
5: five
36: thirty six
000: zero
67: sixty seven
100: one hundred
300: three hundred
125: one hundred and twenty five
30: thirty
11: eleven
16: sixteen
24: twenty four
2: two
4: four
3: three
30: thirty
2010: two thousand and ten
2010: two thousand and ten
2011: two thousand and eleven
1: one
20: twenty
40: forty
40: forty
8: eight
30: thirty
1947: one thousand, nine hundred and forty seven
60: sixty
5: five
60: sixty
12: twelve
12: twelve
1: one
12: twelve
1: one
12: twelve
2: two
2: two
12: twelve
12: twelve
12: twelve
2: two
100: one hundred
50: fifty
100: one hundred
1: one
12: twelve
000: zero
12: twelve
6: six
12: twelve
2000: two thousand
12: twelve
999: nine hundred and ninety nine
14: fourteen
12: twelve
000: zero
4: four
12: twelve
1: one
12: twelve
12: twelve
14: fourteen
12: twelve
000: zero
500: five hundred
500: five hundred
000: zero
10: ten
12: twelve
1: one
12: twelve
12: twelve
300: three hundred
500: five hundred
7: seven
000: zero
12: twelve
12: twelve
2: two
12: twelve
2011: two thousand and eleven
000: zero
14: fourteen
000: zero
63: sixty three
64: sixty four
180: one hundred and eighty
30: thirty
11: eleven
35: thirty five
8: eight
600: six hundred
95: ninety five
000: zero
450: four hundred and fifty
475: four hundred and seventy five
5: five
37: thirty seven
31: thirty one
35: thirty five
65: sixty five
000: zero
20: twenty
1835: one thousand, eight hundred and thirty five
54: fifty four
67: sixty seven
000: zero
000: zero
32: thirty two
000: zero
30: thirty
40: forty
58: fifty eight
000: zero
960: nine hundred and sixty
24: twenty four
27: twenty seven
000: zero
32: thirty two
89: eighty nine
70: seventy
48: forty eight
000: zero
62: sixty two
58: fifty eight
000: zero
12: twelve
69: sixty nine
160: one hundred and sixty
145: one hundred and forty five
500: five hundred
100: one hundred
93: ninety three
500: five hundred
30: thirty
50: fifty
1: one
13: thirteen
200: two hundred
000: zero
6: six
600: six hundred
63: sixty three
400: four hundred
60: sixty
1: one
1: one
000: zero
1: one
1: one
300: three hundred
25: twenty five
000: zero
24: twenty four
90: ninety
110: one hundred and ten
1935: one thousand, nine hundred and thirty five
30: thirty
50: fifty
800: eight hundred
100: one hundred
150: one hundred and fifty
250: two hundred and fifty
22: twenty two
200: two hundred
35: thirty five
2012: two thousand and twelve
1851: one thousand, eight hundred and fifty one
10: ten
24: twenty four
12: twelve
1943: one thousand, nine hundred and forty three
60: sixty
617: six hundred and seventeen
1533: one thousand, five hundred and thirty three
200: two hundred
1537: one thousand, five hundred and thirty seven
56: fifty six
26: twenty six
14: fourteen
2012: two thousand and twelve
20: twenty
30: thirty
21: twenty one
99: ninety nine
15: fifteen
12: twelve
900: nine hundred
1: one
1940: one thousand, nine hundred and forty
53: fifty three
400: four hundred
40: forty
1: one
2: two
52: fifty two
28: twenty eight
25: twenty five
48: forty eight
2008: two thousand and eight
60: sixty
75: seventy five
80: eighty
8: eight
5: five
100: one hundred
62: sixty two
12: twelve
11: eleven
371: three hundred and seventy one
371: three hundred and seventy one
000: zero
11: eleven
80: eighty
23: twenty three
40: forty
000: zero
6: six
2003: two thousand and three
93: ninety three
6: six
20: twenty
2003: two thousand and three
205: two hundred and five
69: sixty nine
33: thirty three
000: zero
59: fifty nine
56: fifty six
000: zero
40: forty
15: fifteen
70: seventy
50: fifty
60: sixty
29: twenty nine
45: forty five
80: eighty
100: one hundred
40: forty
8: eight
30: thirty
1: one
12: twelve
000: zero
55: fifty five
000: zero
10: ten
135: one hundred and thirty five
160: one hundred and sixty
110: one hundred and ten
90: ninety
15: fifteen
150: one hundred and fifty
16: sixteen
170: one hundred and seventy
20: twenty
54: fifty four
195: one hundred and ninety five
10: ten
450: four hundred and fifty
20: twenty
450: four hundred and fifty
400: four hundred
1: one
12: twelve
8: eight
200: two hundred
2: two
1: one
2010: two thousand and ten
25: twenty five
46: forty six
000: zero
62: sixty two
59: fifty nine
1995: one thousand, nine hundred and ninety five
000: zero
51: fifty one
56: fifty six
000: zero
000: zero
24: twenty four
66: sixty six
36: thirty six
57: fifty seven
000: zero
200: two hundred
1: one
76: seventy six
40: forty
20: twenty
16: sixteen
2012: two thousand and twelve
25: twenty five
50: fifty
14: fourteen
2013: two thousand and thirteen
61: sixty one
22: twenty two
11: eleven
66: sixty six
67: sixty seven
1846: one thousand, eight hundred and forty six
75: seventy five
120: one hundred and twenty
75: seventy five
5: five
5: five
11: eleven
3: three
50: fifty
2013: two thousand and thirteen
20: twenty
40: forty
3: three
35: thirty five
24: twenty four
12: twelve
10: ten
18: eighteen
18: eighteen
65: sixty five
100: one hundred
50: fifty
57: fifty seven
300: three hundred
50: fifty
8: eight
75: seventy five
30: thirty
50: fifty
28: twenty eight
10: ten
20: twenty
400: four hundred
500: five hundred
150: one hundred and fifty
7: seven
18: eighteen
26: twenty six
1984: one thousand, nine hundred and eighty four
462: four hundred and sixty two
2014: two thousand and fourteen
72: seventy two
1820: one thousand, eight hundred and twenty
2: two
1754: one thousand, seven hundred and fifty four
10: ten
10: ten
60: sixty
75: seventy five
94: ninety four
50: fifty
100: one hundred
370: three hundred and seventy
40: forty
50: fifty
51: fifty one
200: two hundred
1: one
70: seventy
100: one hundred
200: two hundred
1893: one thousand, eight hundred and ninety three
1991: one thousand, nine hundred and ninety one
100: one hundred
25: twenty five
100: one hundred
30: thirty
10: ten
1: one
200: two hundred
300: three hundred
4: four
120: one hundred and twenty
180: one hundred and eighty
230: two hundred and thirty
1946: one thousand, nine hundred and forty six
20: twenty
200: two hundred
400: four hundred
200: two hundred
500: five hundred
800: eight hundred
10: ten
20: twenty
200: two hundred
70: seventy
10: ten
30: thirty
10: ten
2: two
150: one hundred and fifty
31: thirty one
30: thirty
50: fifty
20: twenty
30: thirty
40: forty
10: ten
500: five hundred
12: twelve
30: thirty
25: twenty five
30: thirty
139: one hundred and thirty nine
25: twenty five
10: ten
20: twenty
638: six hundred and thirty eight
13: thirteen
10: ten
12: twelve
12: twelve
28: twenty eight
35: thirty five
60: sixty
100: one hundred
5: five
40: forty
12: twelve
25: twenty five
25: twenty five
9: nine
5: five
15: fifteen
25: twenty five
000: zero
150: one hundred and fifty
100: one hundred
150: one hundred and fifty
100: one hundred
150: one hundred and fifty
30: thirty
45: forty five
930: nine hundred and thirty
9: nine
4: four
1: one
48: forty eight
400: four hundred
2: two
707: seven hundred and seven
707: seven hundred and seven
50: fifty
2: two
50: fifty
2: two
350: three hundred and fifty
14: fourteen
24: twenty four
25: twenty five
30: thirty
12: twelve
50: fifty
2: two
975: nine hundred and seventy five
300: three hundred
200: two hundred
2: two
600: six hundred
000: zero
5: five
12: twelve
45: forty five
120: one hundred and twenty
800: eight hundred
1: one
38: thirty eight
1: one
2012: two thousand and twelve
1: one
30: thirty
50: fifty
60: sixty
5: five
300: three hundred
200: two hundred
1947: one thousand, nine hundred and forty seven
52: fifty two
90: ninety
2000: two thousand
40: forty
100: one hundred
000: zero
95: ninety five
40: forty
14: fourteen
2: two
79: seventy nine
45: forty five
100: one hundred
500: five hundred
600: six hundred
100: one hundred
2: two
194: one hundred and ninety four
15: fifteen
1779: one thousand, seven hundred and seventy nine
60: sixty
25: twenty five
200: two hundred
250: two hundred and fifty
30: thirty
600: six hundred
100: one hundred
30: thirty
50: fifty
80: eighty
65: sixty five
25: twenty five
20: twenty
180: one hundred and eighty
250: two hundred and fifty
50: fifty
5: five
000: zero
10: ten
200: two hundred
1800: one thousand, eight hundred
1910: one thousand, nine hundred and ten
10: ten
12: twelve
75: seventy five
100: one hundred
15: fifteen
70: seventy
60: sixty
99: ninety nine
45: forty five
5: five
000: zero
40: forty
60: sixty
12: twelve
15: fifteen
400: four hundred
600: six hundred
1: one
853: eight hundred and fifty three
46: forty six
500: five hundred
25: twenty five
1888: one thousand, eight hundred and eighty eight
250: two hundred and fifty
10: ten
5: five
1958: one thousand, nine hundred and fifty eight
85: eighty five
600: six hundred
800: eight hundred
2006: two thousand and six
15: fifteen
200: two hundred
300: three hundred
38: thirty eight
28: twenty eight
100: one hundred
60: sixty
300: three hundred
300: three hundred
75: seventy five
1899: one thousand, eight hundred and ninety nine
300: three hundred
2: two
200: two hundred
21: twenty one
100: one hundred
40: forty
125: one hundred and twenty five
24: twenty four
50: fifty
23: twenty three
13: thirteen
10: ten
120: one hundred and twenty
50: fifty
38: thirty eight
60: sixty
30: thirty
150: one hundred and fifty
12: twelve
2: two
32: thirty two
2012: two thousand and twelve
800: eight hundred
1907: one thousand, nine hundred and seven
50: fifty
50: fifty
35: thirty five
25: twenty five
1: one
90: ninety
450: four hundred and fifty
900: nine hundred
400: four hundred
500: five hundred
750: seven hundred and fifty
58: fifty eight
370: three hundred and seventy
42: forty two
10: ten
150: one hundred and fifty
50: fifty
1901: one thousand, nine hundred and one
5: five
10: ten
30: thirty
20: twenty
4: four
1985: one thousand, nine hundred and eighty five
175: one hundred and seventy five
12: twelve
11: eleven
60: sixty
12: twelve
100: one hundred
100: one hundred
500: five hundred
11: eleven
100: one hundred
300: three hundred
50: fifty
48: forty eight
30: thirty
1924: one thousand, nine hundred and twenty four
160: one hundred and sixty
80: eighty
24: twenty four
15: fifteen
600: six hundred
2: two
1: one
20: twenty
40: forty
50: fifty
15: fifteen
10: ten
10: ten
25: twenty five
70: seventy
20: twenty
100: one hundred
200: two hundred
15: fifteen
34: thirty four
20: twenty
20: twenty
26: twenty six
3: three
32: thirty two
20: twenty
5: five
60: sixty
800: eight hundred
100: one hundred
20: twenty
65: sixty five
50: fifty
20: twenty
10: ten
150: one hundred and fifty
2: two
63: sixty three
33: thirty three
100: one hundred
000: zero
000: zero
500: five hundred
90: ninety
50: fifty
20: twenty
1900: one thousand, nine hundred
10: ten
200: two hundred
17: seventeen
30: thirty
24: twenty four
120: one hundred and twenty
100: one hundred
100: one hundred
1: one
120: one hundred and twenty
27: twenty seven
1934: one thousand, nine hundred and thirty four
673: six hundred and seventy three
29: twenty nine
30: thirty
6: six
600: six hundred
200: two hundred
62: sixty two
100: one hundred
3: three
180: one hundred and eighty
142: one hundred and forty two
100: one hundred
1958: one thousand, nine hundred and fifty eight
25: twenty five
16: sixteen
300: three hundred
400: four hundred
000: zero
30: thirty
10: ten
12: twelve
24: twenty four
300: three hundred
40: forty
80: eighty
400: four hundred
8: eight
200: two hundred
300: three hundred
800: eight hundred
12: twelve
000: zero
20: twenty
7: seven
0: zero
40: forty
40: forty
75: seventy five
20: twenty
4: four
4: four
29: twenty nine
1770: one thousand, seven hundred and seventy
000: zero
627: six hundred and twenty seven
465: four hundred and sixty five
375: three hundred and seventy five
385: three hundred and eighty five
3: three
2011: two thousand and eleven
1942: one thousand, nine hundred and forty two
60: sixty
7: seven
18: eighteen
000: zero
2001: two thousand and one
11: eleven
8: eight
5: five
180: one hundred and eighty
30: thirty
75: seventy five
1993: one thousand, nine hundred and ninety three
1978: one thousand, nine hundred and seventy eight
18: eighteen
20: twenty
3: three
1: one
24: twenty four
20: twenty
24: twenty four
150: one hundred and fifty
15: fifteen
1850: one thousand, eight hundred and fifty
1035: one thousand and thirty five
100: one hundred
000: zero
20: twenty
1887: one thousand, eight hundred and eighty seven
26: twenty six
15: fifteen
155: one hundred and fifty five
30: thirty
240: two hundred and forty
15: fifteen
115: one hundred and fifteen
10: ten
1: one
10: ten
94: ninety four
24: twenty four
2: two
2: two
300: three hundred
450: four hundred and fifty
65: sixty five
100: one hundred
100: one hundred
195: one hundred and ninety five
300: three hundred
165: one hundred and sixty five
37: thirty seven
1: one
28: twenty eight
1814: one thousand, eight hundred and fourteen
200: two hundred
100: one hundred
75: seventy five
100: one hundred
450: four hundred and fifty
32: thirty two
20: twenty
246: two hundred and forty six
270: two hundred and seventy
400: four hundred
125: one hundred and twenty five
380: three hundred and eighty
125: one hundred and twenty five
200: two hundred
250: two hundred and fifty
21: twenty one
1: one
450: four hundred and fifty
40: forty
15: fifteen
170: one hundred and seventy
15: fifteen
700: seven hundred
9: nine
200: two hundred
400: four hundred
75: seventy five
600: six hundred
300: three hundred
170: one hundred and seventy
10: ten
2014: two thousand and fourteen
000: zero
3: three
000: zero
30: thirty
1948: one thousand, nine hundred and forty eight
15: fifteen
50: fifty
20: twenty
18: eighteen
18: eighteen
32: thirty two
100: one hundred
1: one
500: five hundred
1338: one thousand, three hundred and thirty eight
12: twelve
24: twenty four
000: zero
10: ten
1545: one thousand, five hundred and forty five
25: twenty five
2008: two thousand and eight
500: five hundred
112: one hundred and twelve
16: sixteen
35: thirty five
500: five hundred
10: ten
4: four
700: seven hundred
17: seventeen
17: seventeen
2: two
8: eight
17: seventeen
1: one
100: one hundred
2014: two thousand and fourteen
515: five hundred and fifteen
210: two hundred and ten
125: one hundred and twenty five
550: five hundred and fifty
625: six hundred and twenty five
430: four hundred and thirty
150: one hundred and fifty
15: fifteen
75: seventy five
190: one hundred and ninety
133: one hundred and thirty three
295: two hundred and ninety five
37: thirty seven
000: zero
65: sixty five
425: four hundred and twenty five
147: one hundred and forty seven
8: eight
10: ten
62: sixty two
110: one hundred and ten
15: fifteen
20: twenty
20: twenty
21: twenty one
14: fourteen
3: three
25: twenty five
30: thirty
100: one hundred
110: one hundred and ten
800: eight hundred
200: two hundred
000: zero
20: twenty
1: one
400: four hundred
12: twelve
24: twenty four
6000: six thousand
135: one hundred and thirty five
20: twenty
30: thirty
50: fifty
15: fifteen
40: forty
000: zero
1905: one thousand, nine hundred and five
100: one hundred
3: three
100: one hundred
100: one hundred
50: fifty
2014: two thousand and fourteen
1984: one thousand, nine hundred and eighty four
2014: two thousand and fourteen
14: fourteen
87: eighty seven
3: three
20: twenty
65: sixty five
170: one hundred and seventy
11: eleven
2001: two thousand and one
96: ninety six
20: twenty
1945: one thousand, nine hundred and forty five
1965: one thousand, nine hundred and sixty five
1739: one thousand, seven hundred and thirty nine
28: twenty eight
161: one hundred and sixty one
35: thirty five
000: zero
14: fourteen
70: seventy
10: ten
20: twenty
40: forty
1606: one thousand, six hundred and six
10: ten
20: twenty
25: twenty five
300: three hundred
500: five hundred
200: two hundred
1996: one thousand, nine hundred and ninety six
95: ninety five
18: eighteen
59: fifty nine
23: twenty three
18: eighteen
12: twelve
21: twenty one
1976: one thousand, nine hundred and seventy six
25: twenty five
9: nine
458: four hundred and fifty eight
40: forty
11: eleven
15: fifteen
11: eleven
75: seventy five
15: fifteen
15: fifteen
20: twenty
1: one
10: ten
20: twenty
40: forty
85: eighty five
35: thirty five
5: five
1949: one thousand, nine hundred and forty nine
20: twenty
40: forty
11: eleven
70: seventy
30: thirty
000: zero
50: fifty
000: zero
65: sixty five
2016: two thousand and sixteen
28: twenty eight
100: one hundred
150: one hundred and fifty
2050: two thousand and fifty
100: one hundred
1963: one thousand, nine hundred and sixty three
20: twenty
80: eighty
100: one hundred
20: twenty
17: seventeen
000: zero
300: three hundred
46: forty six
700: seven hundred
1527: one thousand, five hundred and twenty seven
82: eighty two
60: sixty
18: eighteen
20: twenty
55: fifty five
1958: one thousand, nine hundred and fifty eight
16: sixteen
13: thirteen
12: twelve
000: zero
100: one hundred
14: fourteen
30: thirty
155: one hundred and fifty five
50: fifty
22: twenty two
000: zero
000: zero
40: forty
100: one hundred
150: one hundred and fifty
52: fifty two
11: eleven
24: twenty four
300: three hundred
400: four hundred
30: thirty
40: forty
30: thirty
27: twenty seven
46: forty six
300: three hundred
10: ten
4: four
7: seven
40: forty
5: five
5: five
80: eighty
80: eighty
150: one hundred and fifty
76: seventy six
20: twenty
2013: two thousand and thirteen
40: forty
20: twenty
930: nine hundred and thirty
1800: one thousand, eight hundred
680: six hundred and eighty
500: five hundred
80: eighty
90: ninety
95: ninety five
50: fifty
11: eleven
24: twenty four
300: three hundred
3: three
000: zero
89: eighty nine
20: twenty
40: forty
5: five
191: one hundred and ninety one
17: seventeen
1963: one thousand, nine hundred and sixty three
40: forty
60: sixty
40: forty
50: fifty
4: four
96: ninety six
0: zero
150: one hundred and fifty
180: one hundred and eighty
5: five
200: two hundred
300: three hundred
26: twenty six
11: eleven
15: fifteen
20: twenty
200: two hundred
120: one hundred and twenty
1: one
20: twenty
90: ninety
100: one hundred
200: two hundred
300: three hundred
100: one hundred
10: ten
20: twenty
82: eighty two
20: twenty
200: two hundred
100: one hundred
200: two hundred
100: one hundred
30: thirty
60: sixty
200: two hundred
200: two hundred
3: three
97: ninety seven
5: five
1: one
5: five
85: eighty five
40: forty
15: fifteen
20: twenty
2: two
4: four
2: two
500: five hundred
20: twenty
20: twenty
25: twenty five
30: thirty
27: twenty seven
000: zero
50: fifty
12: twelve
1: one
12: twelve
1: one
12: twelve
30: thirty
1: one
2: two
000: zero
520: five hundred and twenty
30: thirty
18: eighteen
18: eighteen
16: sixteen
18: eighteen
15: fifteen
20: twenty
7: seven
460: four hundred and sixty
1: one
30: thirty
3: three
2: two
12: twelve
2022: two thousand and twenty two
63: sixty three
3: three
15: fifteen
10: ten
2010: two thousand and ten
1923: one thousand, nine hundred and twenty three
11: eleven
25: twenty five
21: twenty one
100: one hundred
140: one hundred and forty
150: one hundred and fifty
200: two hundred
90: ninety
75: seventy five
20: twenty
16: sixteen
1723: one thousand, seven hundred and twenty three
150: one hundred and fifty
100: one hundred
200: two hundred
50: fifty
100: one hundred
90: ninety
200: two hundred
300: three hundred
30: thirty
24: twenty four
5: five
000: zero
000: zero
1: one
70: seventy
2: two
84: eighty four
1988: one thousand, nine hundred and eighty eight
15: fifteen
150: one hundred and fifty
29: twenty nine
38: thirty eight
1991: one thousand, nine hundred and ninety one
40: forty
55: fifty five
15: fifteen
20: twenty
47: forty seven
28: twenty eight
30: thirty
25: twenty five
98: ninety eight
230: two hundred and thirty
11: eleven
30: thirty
28: twenty eight
1605: one thousand, six hundred and five
40: forty
16: sixteen
80: eighty
120: one hundred and twenty
8: eight
60: sixty
20: twenty
30: thirty
1989: one thousand, nine hundred and eighty nine
20: twenty
18: eighteen
1987: one thousand, nine hundred and eighty seven
1926: one thousand, nine hundred and twenty six
13: thirteen
14: fourteen
250: two hundred and fifty
150: one hundred and fifty
400: four hundred
500: five hundred
120: one hundred and twenty
40: forty
40: forty
7: seven
9: nine
12: twelve
30: thirty
90: ninety
15: fifteen
150: one hundred and fifty
100: one hundred
180: one hundred and eighty
000: zero
175: one hundred and seventy five
150: one hundred and fifty
1828: one thousand, eight hundred and twenty eight
70: seventy
56: fifty six
10: ten
15: fifteen
20: twenty
130: one hundred and thirty
15: fifteen
20: twenty
12: twelve
330: three hundred and thirty
400: four hundred
100: one hundred
100: one hundred
100: one hundred
1911: one thousand, nine hundred and eleven
10: ten
340: three hundred and forty
75: seventy five
60: sixty
1828: one thousand, eight hundred and twenty eight
70: seventy
14: fourteen
15: fifteen
1651: one thousand, six hundred and fifty one
95: ninety five
20: twenty
50: fifty
60: sixty
520: five hundred and twenty
750: seven hundred and fifty
1948: one thousand, nine hundred and forty eight
800: eight hundred
20: twenty
25: twenty five
25: twenty five
60: sixty
500: five hundred
000: zero
58: fifty eight
20: twenty
750: seven hundred and fifty
90: ninety
10: ten
15: fifteen
1: one
12: twelve
000: zero
000: zero
20: twenty
24: twenty four
34: thirty four
729: seven hundred and twenty nine
20: twenty
10: ten
100: one hundred
130: one hundred and thirty
26: twenty six
000: zero
800: eight hundred
37: thirty seven
34: thirty four
90: ninety
15: fifteen
465: four hundred and sixty five
25: twenty five
000: zero
800: eight hundred
15: fifteen
3: three
50: fifty
2: two
18: eighteen
400: four hundred
415: four hundred and fifteen
30: thirty
19: nineteen
23: twenty three
400: four hundred
1706: one thousand, seven hundred and six
16: sixteen
20: twenty
200: two hundred
21: twenty one
500: five hundred
125: one hundred and twenty five
45: forty five
28: twenty eight
29: twenty nine
54: fifty four
16: sixteen
6: six
60: sixty
20: twenty
67: sixty seven
22: twenty two
60: sixty
000: zero
15: fifteen
200: two hundred
5: five
600: six hundred
15: fifteen
1933: one thousand, nine hundred and thirty three
10: ten
40: forty
90: ninety
000: zero
100: one hundred
000: zero
55: fifty five
400: four hundred
30: thirty
000: zero
246: two hundred and forty six
68: sixty eight
30: thirty
135: one hundred and thirty five
160: one hundred and sixty
175: one hundred and seventy five
000: zero
000: zero
2006: two thousand and six
57: fifty seven
185: one hundred and eighty five
125: one hundred and twenty five
205: two hundred and five
950: nine hundred and fifty
60: sixty
125: one hundred and twenty five
450: four hundred and fifty
385: three hundred and eighty five
5: five
7: seven
3: three
31: thirty one
80: eighty
69: sixty nine
8: eight
000: zero
350: three hundred and fifty
400: four hundred
550: five hundred and fifty
600: six hundred
650: six hundred and fifty
000: zero
000: zero
165: one hundred and sixty five
15: fifteen
90: ninety
20: twenty
10: ten
85: eighty five
100: one hundred
2: two
120: one hundred and twenty
13: thirteen
260: two hundred and sixty
27: twenty seven
61: sixty one
300: three hundred
165: one hundred and sixty five
175: one hundred and seventy five
50: fifty
225: two hundred and twenty five
35: thirty five
35: thirty five
42: forty two
500: five hundred
70: seventy
700: seven hundred
245: two hundred and forty five
90: ninety
120: one hundred and twenty
200: two hundred
65: sixty five
50: fifty
2: two
60: sixty
8: eight
425: four hundred and twenty five
225: two hundred and twenty five
1: one
200: two hundred
99: ninety nine
50: fifty
000: zero
400: four hundred
500: five hundred
100: one hundred
300: three hundred
500: five hundred
99: ninety nine
3: three
1: one
35: thirty five
270: two hundred and seventy
32: thirty two
32: thirty two
30: thirty
20: twenty
130: one hundred and thirty
300: three hundred
23: twenty three
12: twelve
22: twenty two
20: twenty
22: twenty two
29: twenty nine
000: zero
60: sixty
10: ten
15: fifteen
239: two hundred and thirty nine
14: fourteen
250: two hundred and fifty
16: sixteen
350: three hundred and fifty
370: three hundred and seventy
350: three hundred and fifty
000: zero
99: ninety nine
53: fifty three
5: five
75: seventy five
40: forty
40: forty
30: thirty
2017: two thousand and seventeen
2: two
16: sixteen
18: eighteen
5: five
10: ten
20: twenty
6: six
15: fifteen
20: twenty
000: zero
1918: one thousand, nine hundred and eighteen
1923: one thousand, nine hundred and twenty three
53: fifty three
55: fifty five
130: one hundred and thirty
12: twelve
69: sixty nine
160: one hundred and sixty
36: thirty six
200: two hundred
100: one hundred
150: one hundred and fifty
95: ninety five
9: nine
155: one hundred and fifty five
10: ten
25: twenty five
30: thirty
235: two hundred and thirty five
125: one hundred and twenty five
110: one hundred and ten
1981: one thousand, nine hundred and eighty one
12: twelve
50: fifty
10: ten
1: one
1967: one thousand, nine hundred and sixty seven
90: ninety
400: four hundred
40: forty
4: four
000: zero
50: fifty
30: thirty
100: one hundred
25: twenty five
8: eight
1745: one thousand, seven hundred and forty five
000: zero
8: eight
5: five
2016: two thousand and sixteen
10: ten
1: one
15: fifteen
80: eighty
35: thirty five
20: twenty
100: one hundred
50: fifty
40: forty
10: ten
10: ten
620: six hundred and twenty
12: twelve
350: three hundred and fifty
500: five hundred
14: fourteen
10: ten
11: eleven
11: eleven
10: ten
647: six hundred and forty seven
101: one hundred and one
30: thirty
200: two hundred
200: two hundred
300: three hundred
140: one hundred and forty
1918: one thousand, nine hundred and eighteen
1961: one thousand, nine hundred and sixty one
1940: one thousand, nine hundred and forty
18: eighteen
250: two hundred and fifty
165: one hundred and sixty five
85: eighty five
42: forty two
820: eight hundred and twenty
44: forty four
250: two hundred and fifty
1759: one thousand, seven hundred and fifty nine
100: one hundred
15: fifteen
60: sixty
20: twenty
30: thirty
23: twenty three
13: thirteen
7: seven
000: zero
000: zero
9000: nine thousand
966: nine hundred and sixty six
100: one hundred
55: fifty five
15: fifteen
500: five hundred
14: fourteen
65: sixty five
96: ninety six
5: five
20: twenty
35: thirty five
200: two hundred
30: thirty
1940: one thousand, nine hundred and forty
15: fifteen
18: eighteen
5: five
30: thirty
20: twenty
100: one hundred
14: fourteen
95: ninety five
4: four
2: two
1960: one thousand, nine hundred and sixty
1963: one thousand, nine hundred and sixty three
57: fifty seven
900: nine hundred
6: six
90: ninety
40: forty
000: zero
2: two
30: thirty
000: zero
2009: two thousand and nine
10: ten
10: ten
40: forty
60: sixty
25: twenty five
35: thirty five
78: seventy eight
1826: one thousand, eight hundred and twenty six
12: twelve
6: six
25: twenty five
27: twenty seven
1: one
300: three hundred
400: four hundred
100: one hundred
60: sixty
425: four hundred and twenty five
000: zero
10: ten
13: thirteen
425: four hundred and twenty five
6: six
100: one hundred
70: seventy
6: six
100: one hundred
17: seventeen
100: one hundred
120: one hundred and twenty
50: fifty
5: five
12: twelve
75: seventy five
10: ten
209: two hundred and nine
50: fifty
260: two hundred and sixty
260: two hundred and sixty
160: one hundred and sixty
14: fourteen
50: fifty
26: twenty six
18: eighteen
8: eight
70: seventy
20: twenty
500: five hundred
1: one
3: three
12: twelve
1878: one thousand, eight hundred and seventy eight
30: thirty
40: forty
6: six
2015: two thousand and fifteen
7: seven
12: twelve
1977: one thousand, nine hundred and seventy seven
30: thirty
1954: one thousand, nine hundred and fifty four
20: twenty
12: twelve
2015: two thousand and fifteen
2: two
54: fifty four
10: ten
24: twenty four
300: three hundred
218: two hundred and eighteen
35: thirty five
1951: one thousand, nine hundred and fifty one
20: twenty
90: ninety
90: ninety
15: fifteen
854: eight hundred and fifty four
1985: one thousand, nine hundred and eighty five
20: twenty
000: zero
30: thirty
3: three
3: three
5: five
49: forty nine
350: three hundred and fifty
100: one hundred
200: two hundred
105: one hundred and five
12: twelve
1: one
5: five
400: four hundred
5: five
1: one
2: two
000: zero
2011: two thousand and eleven
1911: one thousand, nine hundred and eleven
1967: one thousand, nine hundred and sixty seven
15: fifteen
11: eleven
2016: two thousand and sixteen
646: six hundred and forty six
2: two
2: two
1973: one thousand, nine hundred and seventy three
65: sixty five
100: one hundred
150: one hundred and fifty
600: six hundred
400: four hundred
500: five hundred
1994: one thousand, nine hundred and ninety four
17: seventeen
30: thirty
15: fifteen
200: two hundred
300: three hundred
15: fifteen
2016: two thousand and sixteen
50: fifty
50: fifty
100: one hundred
520: five hundred and twenty
150: one hundred and fifty
2300: two thousand, three hundred
24: twenty four
15: fifteen
40: forty
10: ten
200: two hundred
1: one
700: seven hundred
200: two hundred
18: eighteen
15: fifteen
20: twenty
14: fourteen
69: sixty nine
3: three
200: two hundred
25: twenty five
930: nine hundred and thirty
10: ten
2016: two thousand and sixteen
2016: two thousand and sixteen
2016: two thousand and sixteen
180: one hundred and eighty
13: thirteen
7: seven
1852: one thousand, eight hundred and fifty two
1: one
35: thirty five
150: one hundred and fifty
42: forty two
2: two
100: one hundred
70: seventy
100: one hundred
20: twenty
30: thirty
4: four
60: sixty
100: one hundred
100: one hundred
700: seven hundred
20: twenty
000: zero
70: seventy
1: one
20: twenty
65: sixty five
70: seventy
20: twenty
20: twenty
100: one hundred
000: zero
30: thirty
110: one hundred and ten
22: twenty two
24: twenty four
120: one hundred and twenty
10: ten
12: twelve
14: fourteen
5: five
31: thirty one
12: twelve
7: seven
9: nine
2: two
74: seventy four
1: one
2009: two thousand and nine
40: forty
1969: one thousand, nine hundred and sixty nine
1998: one thousand, nine hundred and ninety eight
14: fourteen
1973: one thousand, nine hundred and seventy three
000: zero
100: one hundred
25: twenty five
20: twenty
48: forty eight
17: seventeen
24: twenty four
4: four
000: zero
2: two
55: fifty five
4: four
24: twenty four
28: twenty eight
27: twenty seven
27: twenty seven
30: thirty
30: thirty
12: twelve
600: six hundred
500: five hundred
1835: one thousand, eight hundred and thirty five
22: twenty two
20: twenty
45: forty five
60: sixty
89: eighty nine
40: forty
0: zero
12: twelve
10: ten
1826: one thousand, eight hundred and twenty six
000: zero
5: five
1848: one thousand, eight hundred and forty eight
14: fourteen
16: sixteen
11: eleven
800: eight hundred
300: three hundred
000: zero
100: one hundred
600: six hundred
125: one hundred and twenty five
20: twenty
85: eighty five
40: forty
8: eight
18: eighteen
73: seventy three
40: forty
2: two
1948: one thousand, nine hundred and forty eight
80: eighty
500: five hundred
200: two hundred
400: four hundred
1948: one thousand, nine hundred and forty eight
20: twenty
1790: one thousand, seven hundred and ninety
1984: one thousand, nine hundred and eighty four
2: two
000: zero
400: four hundred
150: one hundred and fifty
15: fifteen
25: twenty five
000: zero
1: one
000: zero
47: forty seven
22: twenty two
5: five
40: forty
100: one hundred
40: forty
10: ten
2: two
1: one
49: forty nine
100: one hundred
350: three hundred and fifty
155: one hundred and fifty five
90: ninety
95: ninety five
000: zero
325: three hundred and twenty five
320: three hundred and twenty
325: three hundred and twenty five
10: ten
20: twenty
15: fifteen
30: thirty
80: eighty
60: sixty
15: fifteen
1: one
10: ten
20: twenty
2: two
1848: one thousand, eight hundred and forty eight
1842: one thousand, eight hundred and forty two
90: ninety
25: twenty five
30: thirty
10: ten
Some additional insights:
Average sentence length: 7.231702262265756
Max sentence length: 28
Min sentence length: 3
Unique words: 17388
Shape: (45839, 1)
sentence
0 WHEN YOU'RE COOKING CHIPS AT HOME
1 THE TRADITIONAL CHIP PAN OFTEN STAYS ON THE SHELF
2 THROUGH WHAT THEY CALL A KNIFE BLOCK
3 WHICH INVOLVES FIRING A POTATO DOWN A PIPE
4 APART FROM THE GOLDEN COLOUR AND THE DELICIOUS...
In [ ]:
# Define a color palette
palette = {
"histogram": "#2980B9",
"bar1": "#3498DB",
"bar2": "#E74C3C",
"bar3": "#1ABC9C"
}
# 1. Distribution of Sentence Lengths
plt.figure(figsize=(12, 6))
sentence_lengths = df['sentence'].str.split().apply(len)
sns.histplot(sentence_lengths, bins=30, color=palette["histogram"], edgecolor='black', alpha=0.7)
plt.title('Distribution of Sentence Lengths', fontsize=15)
plt.xlabel('Sentence Length (words)', fontsize=12)
plt.ylabel('Frequency', fontsize=12)
plt.grid(axis='y', linestyle='--', alpha=0.5)
plt.tight_layout()
plt.show()
# 2. Proportion of Sentences with Numbers Converted vs. Total Sentences
plt.figure(figsize=(10, 6))
labels = ['Sentences with Numbers Converted', 'Other Sentences']
values = [len(numbers_converted), len(df) - len(numbers_converted)]
bars = plt.bar(labels, values, color=[palette["bar1"], palette["bar2"]])
for bar in bars:
yval = bar.get_height()
plt.text(bar.get_x() + bar.get_width()/2, yval + 100,
f'{yval} ({yval/len(df)*100:.1f}%)',
ha='center', va='bottom', fontweight='bold')
plt.title('Proportion of Sentences with Numbers Converted')
plt.ylabel('Count')
plt.tight_layout()
plt.show()
# 3. Top 10 Most Frequently Converted Numbers
num_freq = Counter([num for num, word in numbers_converted])
common_nums = num_freq.most_common(10)
nums, counts = zip(*common_nums)
plt.figure(figsize=(12, 7))
bars = plt.barh(nums, counts, color=palette["bar3"])
plt.gca().invert_yaxis() # To display the most frequent number at the top
for bar in bars:
plt.text(bar.get_width() - (0.02 * max(counts)), bar.get_y() + bar.get_height()/2,
str(int(bar.get_width())), va='center', ha='right', color='white', fontweight='bold')
plt.title('Top 10 Most Frequently Converted Numbers', fontsize=15)
plt.xlabel('Frequency', fontsize=12)
plt.ylabel('Number', fontsize=12)
plt.tight_layout()
plt.show()
In [ ]:
sentences = df['sentence'].tolist()
# Calculate Unique Word Count
unique_words = set(word for sentence in sentences for word in sentence.split())
print(f"Number of unique words: {len(unique_words)}")
# Initial Letters Distribution
initial_letters = [word[0].lower() for sentence in sentences for word in sentence.split()]
initial_letter_freq = Counter(initial_letters)
# Vowel and Consonant Distribution
vowels = set("aeiou")
num_vowels = sum(1 for word in ''.join(sentences).lower() if word in vowels)
num_consonants = sum(1 for word in ''.join(sentences).lower() if word.isalpha() and word not in vowels)
print(f"\nThe number of vowels are: {num_vowels}")
print(f"The number of consonants are: {num_consonants}")
Number of unique words: 17388 The number of vowels are: 534404 The number of consonants are: 858424
In [ ]:
# Calculate sentence lengths
sentence_lengths = [len(nltk.word_tokenize(line)) for line in df['sentence']]
# Statistics
average_length = np.mean(sentence_lengths)
shortest_length = np.min(sentence_lengths)
longest_length = np.max(sentence_lengths)
# Print statistics
print("Average sentence length:", average_length)
print("Median sentence length:", np.median(sentence_lengths))
print("Standard deviation of sentence length:", np.std(sentence_lengths, ddof=1))
print("Minimum sentence length:", shortest_length)
print("Maximum sentence length:", longest_length)
# Histogram for Sentence Lengths Distribution
plt.figure(figsize=(10, 6))
plt.hist(sentence_lengths, bins=30, edgecolor='k', alpha=0.7, color="#3498DB")
plt.title('Sentence Lengths Distribution')
plt.xlabel('Sentence Length')
plt.ylabel('Number of Sentences')
plt.show()
# Bar plot for Average, Shortest, and Longest sentence lengths
plt.figure(figsize=(10, 6))
sentence_labels = ['Average', 'Shortest', 'Longest']
lengths = [average_length, shortest_length, longest_length]
sns.barplot(x=sentence_labels, y=lengths, palette="Blues_d")
plt.title('Sentence Lengths Overview')
plt.ylabel('Number of Words')
plt.show()
# Unique Word Count Visualization
plt.figure(figsize=(5, 6))
sns.barplot(x=['Unique Words'], y=[len(unique_words)], palette="Purples_d")
plt.title('Unique Word Count')
plt.show()
# Vowel vs. Consonant Distribution Visualization
plt.figure(figsize=(8, 8))
labels = ['Vowels', 'Consonants']
sizes = [num_vowels, num_consonants]
colors = ['#ff9999','#66b2b2']
plt.pie(sizes, labels=labels, colors=colors, autopct='%1.1f%%', startangle=90)
plt.title('Vowel vs. Consonant Distribution')
plt.axis('equal') # Equal aspect ratio ensures pie is drawn as a circle.
plt.show()
# Initial Letter Distribution Visualization
plt.figure(figsize=(14, 8))
letters, counts = zip(*initial_letter_freq.most_common())
sns.barplot(x=list(letters), y=list(counts), palette="viridis")
plt.title('Initial Letter Distribution')
plt.xlabel('Initial Letter')
plt.ylabel('Count')
plt.show()
# Number-to-Word Conversions Table
print("\nNumbers converted to words:")
for number, words in numbers_converted:
print(f"{number}: {words.replace('-', ' ')}")
# Insights from DataFrame Visualization
plt.figure(figsize=(10, 6))
df_lengths = [df['sentence'].str.split().apply(len).mean(), df['sentence'].str.split().apply(len).min(), df['sentence'].str.split().apply(len).max()]
df_labels = ['Average', 'Shortest', 'Longest']
sns.barplot(x=df_labels, y=df_lengths, palette="Greens_d")
plt.title('Sentence Lengths from DataFrame')
plt.ylabel('Number of Words')
plt.show()
Average sentence length: 7.540958572394686 Median sentence length: 6.0 Standard deviation of sentence length: 3.871960425939225 Minimum sentence length: 3 Maximum sentence length: 30
Numbers converted to words: 4: four 24: twenty four 1972: one thousand, nine hundred and seventy two 1: one 20: twenty 2: two 2012: two thousand and twelve 110: one hundred and ten 6: six 14: fourteen 31: thirty one 1964: one thousand, nine hundred and sixty four 1965: one thousand, nine hundred and sixty five 69: sixty nine 1: one 1966: one thousand, nine hundred and sixty six 67: sixty seven 1: one 10: ten 230: two hundred and thirty 1968: one thousand, nine hundred and sixty eight 1969: one thousand, nine hundred and sixty nine 80: eighty 17: seventeen 50: fifty 60: sixty 200: two hundred 180: one hundred and eighty 10: ten 20: twenty 195: one hundred and ninety five 249: two hundred and forty nine 300: three hundred 500: five hundred 350: three hundred and fifty 25: twenty five 20: twenty 65: sixty five 10: ten 2005: two thousand and five 000: zero 50: fifty 1: one 40: forty 60: sixty 23: twenty three 5: five 10: ten 10: ten 000: zero 180: one hundred and eighty 54: fifty four 1: one 300: three hundred 45: forty five 50: fifty 5: five 20: twenty 80: eighty 10: ten 100: one hundred 3: three 10: ten 21: twenty one 21: twenty one 400: four hundred 5: five 40: forty 35: thirty five 5: five 1: one 15: fifteen 250: two hundred and fifty 15: fifteen 25: twenty five 27: twenty seven 60: sixty 4: four 500: five hundred 6: six 6: six 20: twenty 68: sixty eight 7: seven 24: twenty four 2: two 3: three 2000: two thousand 40: forty 39: thirty nine 1940: one thousand, nine hundred and forty 75: seventy five 000: zero 100: one hundred 1: one 800: eight hundred 108: one hundred and eight 012: twelve 2012: two thousand and twelve 20: twenty 10: ten 85: eighty five 31: thirty one 300: three hundred 9: nine 3: three 150: one hundred and fifty 20: twenty 500: five hundred 000: zero 100: one hundred 1734: one thousand, seven hundred and thirty four 147: one hundred and forty seven 85: eighty five 55: fifty five 14: fourteen 15: fifteen 80: eighty 000: zero 90: ninety 200: two hundred 25: twenty five 300: three hundred 75: seventy five 10: ten 5: five 22: twenty two 150: one hundred and fifty 45: forty five 2001: two thousand and one 10: ten 6: six 300: three hundred 12: twelve 185: one hundred and eighty five 80: eighty 14: fourteen 30: thirty 40: forty 29: twenty nine 14: fourteen 300: three hundred 1: one 810: eight hundred and ten 18: eighteen 18: eighteen 40: forty 60: sixty 35: thirty five 000: zero 000: zero 9: nine 15: fifteen 17: seventeen 12: twelve 25: twenty five 30: thirty 21: twenty one 300: three hundred 10: ten 2009: two thousand and nine 45: forty five 30: thirty 59: fifty nine 7: seven 24: twenty four 5: five 000: zero 30: thirty 86: eighty six 32: thirty two 10: ten 5: five 70: seventy 100: one hundred 50: fifty 28: twenty eight 65: sixty five 45: forty five 30: thirty 40: forty 000: zero 1642: one thousand, six hundred and forty two 65: sixty five 12: twelve 1527: one thousand, five hundred and twenty seven 150: one hundred and fifty 250: two hundred and fifty 60: sixty 1900: one thousand, nine hundred 140: one hundred and forty 10: ten 12: twelve 13: thirteen 35: thirty five 12: twelve 40: forty 50: fifty 50: fifty 50: fifty 9: nine 5: five 1: one 125: one hundred and twenty five 5: five 28: twenty eight 24: twenty four 000: zero 18: eighteen 78: seventy eight 100: one hundred 10: ten 100: one hundred 18: eighteen 25: twenty five 1979: one thousand, nine hundred and seventy nine 33: thirty three 100: one hundred 100: one hundred 50: fifty 80: eighty 20: twenty 30: thirty 1: one 1709: one thousand, seven hundred and nine 1710: one thousand, seven hundred and ten 30: thirty 50: fifty 400: four hundred 218: two hundred and eighteen 97: ninety seven 24: twenty four 28: twenty eight 4: four 520: five hundred and twenty 32: thirty two 2013: two thousand and thirteen 2013: two thousand and thirteen 10: ten 160: one hundred and sixty 250: two hundred and fifty 2012: two thousand and twelve 78: seventy eight 2010: two thousand and ten 1980: one thousand, nine hundred and eighty 600: six hundred 500: five hundred 1: one 000: zero 30: thirty 400: four hundred 10: ten 200: two hundred 27: twenty seven 5: five 14: fourteen 90: ninety 25: twenty five 20: twenty 41: forty one 25: twenty five 100: one hundred 150: one hundred and fifty 20: twenty 30: thirty 25: twenty five 60: sixty 41: forty one 40: forty 47: forty seven 20: twenty 50: fifty 60: sixty 50: fifty 400: four hundred 1: one 20: twenty 140: one hundred and forty 1: one 80: eighty 20: twenty 95: ninety five 18: eighteen 500: five hundred 40: forty 50: fifty 500: five hundred 1: one 2015: two thousand and fifteen 100: one hundred 2: two 190: one hundred and ninety 400: four hundred 5: five 10: ten 50: fifty 5: five 36: thirty six 000: zero 67: sixty seven 100: one hundred 300: three hundred 125: one hundred and twenty five 30: thirty 11: eleven 16: sixteen 24: twenty four 2: two 4: four 3: three 30: thirty 2010: two thousand and ten 2010: two thousand and ten 2011: two thousand and eleven 1: one 20: twenty 40: forty 40: forty 8: eight 30: thirty 1947: one thousand, nine hundred and forty seven 60: sixty 5: five 60: sixty 12: twelve 12: twelve 1: one 12: twelve 1: one 12: twelve 2: two 2: two 12: twelve 12: twelve 12: twelve 2: two 100: one hundred 50: fifty 100: one hundred 1: one 12: twelve 000: zero 12: twelve 6: six 12: twelve 2000: two thousand 12: twelve 999: nine hundred and ninety nine 14: fourteen 12: twelve 000: zero 4: four 12: twelve 1: one 12: twelve 12: twelve 14: fourteen 12: twelve 000: zero 500: five hundred 500: five hundred 000: zero 10: ten 12: twelve 1: one 12: twelve 12: twelve 300: three hundred 500: five hundred 7: seven 000: zero 12: twelve 12: twelve 2: two 12: twelve 2011: two thousand and eleven 000: zero 14: fourteen 000: zero 63: sixty three 64: sixty four 180: one hundred and eighty 30: thirty 11: eleven 35: thirty five 8: eight 600: six hundred 95: ninety five 000: zero 450: four hundred and fifty 475: four hundred and seventy five 5: five 37: thirty seven 31: thirty one 35: thirty five 65: sixty five 000: zero 20: twenty 1835: one thousand, eight hundred and thirty five 54: fifty four 67: sixty seven 000: zero 000: zero 32: thirty two 000: zero 30: thirty 40: forty 58: fifty eight 000: zero 960: nine hundred and sixty 24: twenty four 27: twenty seven 000: zero 32: thirty two 89: eighty nine 70: seventy 48: forty eight 000: zero 62: sixty two 58: fifty eight 000: zero 12: twelve 69: sixty nine 160: one hundred and sixty 145: one hundred and forty five 500: five hundred 100: one hundred 93: ninety three 500: five hundred 30: thirty 50: fifty 1: one 13: thirteen 200: two hundred 000: zero 6: six 600: six hundred 63: sixty three 400: four hundred 60: sixty 1: one 1: one 000: zero 1: one 1: one 300: three hundred 25: twenty five 000: zero 24: twenty four 90: ninety 110: one hundred and ten 1935: one thousand, nine hundred and thirty five 30: thirty 50: fifty 800: eight hundred 100: one hundred 150: one hundred and fifty 250: two hundred and fifty 22: twenty two 200: two hundred 35: thirty five 2012: two thousand and twelve 1851: one thousand, eight hundred and fifty one 10: ten 24: twenty four 12: twelve 1943: one thousand, nine hundred and forty three 60: sixty 617: six hundred and seventeen 1533: one thousand, five hundred and thirty three 200: two hundred 1537: one thousand, five hundred and thirty seven 56: fifty six 26: twenty six 14: fourteen 2012: two thousand and twelve 20: twenty 30: thirty 21: twenty one 99: ninety nine 15: fifteen 12: twelve 900: nine hundred 1: one 1940: one thousand, nine hundred and forty 53: fifty three 400: four hundred 40: forty 1: one 2: two 52: fifty two 28: twenty eight 25: twenty five 48: forty eight 2008: two thousand and eight 60: sixty 75: seventy five 80: eighty 8: eight 5: five 100: one hundred 62: sixty two 12: twelve 11: eleven 371: three hundred and seventy one 371: three hundred and seventy one 000: zero 11: eleven 80: eighty 23: twenty three 40: forty 000: zero 6: six 2003: two thousand and three 93: ninety three 6: six 20: twenty 2003: two thousand and three 205: two hundred and five 69: sixty nine 33: thirty three 000: zero 59: fifty nine 56: fifty six 000: zero 40: forty 15: fifteen 70: seventy 50: fifty 60: sixty 29: twenty nine 45: forty five 80: eighty 100: one hundred 40: forty 8: eight 30: thirty 1: one 12: twelve 000: zero 55: fifty five 000: zero 10: ten 135: one hundred and thirty five 160: one hundred and sixty 110: one hundred and ten 90: ninety 15: fifteen 150: one hundred and fifty 16: sixteen 170: one hundred and seventy 20: twenty 54: fifty four 195: one hundred and ninety five 10: ten 450: four hundred and fifty 20: twenty 450: four hundred and fifty 400: four hundred 1: one 12: twelve 8: eight 200: two hundred 2: two 1: one 2010: two thousand and ten 25: twenty five 46: forty six 000: zero 62: sixty two 59: fifty nine 1995: one thousand, nine hundred and ninety five 000: zero 51: fifty one 56: fifty six 000: zero 000: zero 24: twenty four 66: sixty six 36: thirty six 57: fifty seven 000: zero 200: two hundred 1: one 76: seventy six 40: forty 20: twenty 16: sixteen 2012: two thousand and twelve 25: twenty five 50: fifty 14: fourteen 2013: two thousand and thirteen 61: sixty one 22: twenty two 11: eleven 66: sixty six 67: sixty seven 1846: one thousand, eight hundred and forty six 75: seventy five 120: one hundred and twenty 75: seventy five 5: five 5: five 11: eleven 3: three 50: fifty 2013: two thousand and thirteen 20: twenty 40: forty 3: three 35: thirty five 24: twenty four 12: twelve 10: ten 18: eighteen 18: eighteen 65: sixty five 100: one hundred 50: fifty 57: fifty seven 300: three hundred 50: fifty 8: eight 75: seventy five 30: thirty 50: fifty 28: twenty eight 10: ten 20: twenty 400: four hundred 500: five hundred 150: one hundred and fifty 7: seven 18: eighteen 26: twenty six 1984: one thousand, nine hundred and eighty four 462: four hundred and sixty two 2014: two thousand and fourteen 72: seventy two 1820: one thousand, eight hundred and twenty 2: two 1754: one thousand, seven hundred and fifty four 10: ten 10: ten 60: sixty 75: seventy five 94: ninety four 50: fifty 100: one hundred 370: three hundred and seventy 40: forty 50: fifty 51: fifty one 200: two hundred 1: one 70: seventy 100: one hundred 200: two hundred 1893: one thousand, eight hundred and ninety three 1991: one thousand, nine hundred and ninety one 100: one hundred 25: twenty five 100: one hundred 30: thirty 10: ten 1: one 200: two hundred 300: three hundred 4: four 120: one hundred and twenty 180: one hundred and eighty 230: two hundred and thirty 1946: one thousand, nine hundred and forty six 20: twenty 200: two hundred 400: four hundred 200: two hundred 500: five hundred 800: eight hundred 10: ten 20: twenty 200: two hundred 70: seventy 10: ten 30: thirty 10: ten 2: two 150: one hundred and fifty 31: thirty one 30: thirty 50: fifty 20: twenty 30: thirty 40: forty 10: ten 500: five hundred 12: twelve 30: thirty 25: twenty five 30: thirty 139: one hundred and thirty nine 25: twenty five 10: ten 20: twenty 638: six hundred and thirty eight 13: thirteen 10: ten 12: twelve 12: twelve 28: twenty eight 35: thirty five 60: sixty 100: one hundred 5: five 40: forty 12: twelve 25: twenty five 25: twenty five 9: nine 5: five 15: fifteen 25: twenty five 000: zero 150: one hundred and fifty 100: one hundred 150: one hundred and fifty 100: one hundred 150: one hundred and fifty 30: thirty 45: forty five 930: nine hundred and thirty 9: nine 4: four 1: one 48: forty eight 400: four hundred 2: two 707: seven hundred and seven 707: seven hundred and seven 50: fifty 2: two 50: fifty 2: two 350: three hundred and fifty 14: fourteen 24: twenty four 25: twenty five 30: thirty 12: twelve 50: fifty 2: two 975: nine hundred and seventy five 300: three hundred 200: two hundred 2: two 600: six hundred 000: zero 5: five 12: twelve 45: forty five 120: one hundred and twenty 800: eight hundred 1: one 38: thirty eight 1: one 2012: two thousand and twelve 1: one 30: thirty 50: fifty 60: sixty 5: five 300: three hundred 200: two hundred 1947: one thousand, nine hundred and forty seven 52: fifty two 90: ninety 2000: two thousand 40: forty 100: one hundred 000: zero 95: ninety five 40: forty 14: fourteen 2: two 79: seventy nine 45: forty five 100: one hundred 500: five hundred 600: six hundred 100: one hundred 2: two 194: one hundred and ninety four 15: fifteen 1779: one thousand, seven hundred and seventy nine 60: sixty 25: twenty five 200: two hundred 250: two hundred and fifty 30: thirty 600: six hundred 100: one hundred 30: thirty 50: fifty 80: eighty 65: sixty five 25: twenty five 20: twenty 180: one hundred and eighty 250: two hundred and fifty 50: fifty 5: five 000: zero 10: ten 200: two hundred 1800: one thousand, eight hundred 1910: one thousand, nine hundred and ten 10: ten 12: twelve 75: seventy five 100: one hundred 15: fifteen 70: seventy 60: sixty 99: ninety nine 45: forty five 5: five 000: zero 40: forty 60: sixty 12: twelve 15: fifteen 400: four hundred 600: six hundred 1: one 853: eight hundred and fifty three 46: forty six 500: five hundred 25: twenty five 1888: one thousand, eight hundred and eighty eight 250: two hundred and fifty 10: ten 5: five 1958: one thousand, nine hundred and fifty eight 85: eighty five 600: six hundred 800: eight hundred 2006: two thousand and six 15: fifteen 200: two hundred 300: three hundred 38: thirty eight 28: twenty eight 100: one hundred 60: sixty 300: three hundred 300: three hundred 75: seventy five 1899: one thousand, eight hundred and ninety nine 300: three hundred 2: two 200: two hundred 21: twenty one 100: one hundred 40: forty 125: one hundred and twenty five 24: twenty four 50: fifty 23: twenty three 13: thirteen 10: ten 120: one hundred and twenty 50: fifty 38: thirty eight 60: sixty 30: thirty 150: one hundred and fifty 12: twelve 2: two 32: thirty two 2012: two thousand and twelve 800: eight hundred 1907: one thousand, nine hundred and seven 50: fifty 50: fifty 35: thirty five 25: twenty five 1: one 90: ninety 450: four hundred and fifty 900: nine hundred 400: four hundred 500: five hundred 750: seven hundred and fifty 58: fifty eight 370: three hundred and seventy 42: forty two 10: ten 150: one hundred and fifty 50: fifty 1901: one thousand, nine hundred and one 5: five 10: ten 30: thirty 20: twenty 4: four 1985: one thousand, nine hundred and eighty five 175: one hundred and seventy five 12: twelve 11: eleven 60: sixty 12: twelve 100: one hundred 100: one hundred 500: five hundred 11: eleven 100: one hundred 300: three hundred 50: fifty 48: forty eight 30: thirty 1924: one thousand, nine hundred and twenty four 160: one hundred and sixty 80: eighty 24: twenty four 15: fifteen 600: six hundred 2: two 1: one 20: twenty 40: forty 50: fifty 15: fifteen 10: ten 10: ten 25: twenty five 70: seventy 20: twenty 100: one hundred 200: two hundred 15: fifteen 34: thirty four 20: twenty 20: twenty 26: twenty six 3: three 32: thirty two 20: twenty 5: five 60: sixty 800: eight hundred 100: one hundred 20: twenty 65: sixty five 50: fifty 20: twenty 10: ten 150: one hundred and fifty 2: two 63: sixty three 33: thirty three 100: one hundred 000: zero 000: zero 500: five hundred 90: ninety 50: fifty 20: twenty 1900: one thousand, nine hundred 10: ten 200: two hundred 17: seventeen 30: thirty 24: twenty four 120: one hundred and twenty 100: one hundred 100: one hundred 1: one 120: one hundred and twenty 27: twenty seven 1934: one thousand, nine hundred and thirty four 673: six hundred and seventy three 29: twenty nine 30: thirty 6: six 600: six hundred 200: two hundred 62: sixty two 100: one hundred 3: three 180: one hundred and eighty 142: one hundred and forty two 100: one hundred 1958: one thousand, nine hundred and fifty eight 25: twenty five 16: sixteen 300: three hundred 400: four hundred 000: zero 30: thirty 10: ten 12: twelve 24: twenty four 300: three hundred 40: forty 80: eighty 400: four hundred 8: eight 200: two hundred 300: three hundred 800: eight hundred 12: twelve 000: zero 20: twenty 7: seven 0: zero 40: forty 40: forty 75: seventy five 20: twenty 4: four 4: four 29: twenty nine 1770: one thousand, seven hundred and seventy 000: zero 627: six hundred and twenty seven 465: four hundred and sixty five 375: three hundred and seventy five 385: three hundred and eighty five 3: three 2011: two thousand and eleven 1942: one thousand, nine hundred and forty two 60: sixty 7: seven 18: eighteen 000: zero 2001: two thousand and one 11: eleven 8: eight 5: five 180: one hundred and eighty 30: thirty 75: seventy five 1993: one thousand, nine hundred and ninety three 1978: one thousand, nine hundred and seventy eight 18: eighteen 20: twenty 3: three 1: one 24: twenty four 20: twenty 24: twenty four 150: one hundred and fifty 15: fifteen 1850: one thousand, eight hundred and fifty 1035: one thousand and thirty five 100: one hundred 000: zero 20: twenty 1887: one thousand, eight hundred and eighty seven 26: twenty six 15: fifteen 155: one hundred and fifty five 30: thirty 240: two hundred and forty 15: fifteen 115: one hundred and fifteen 10: ten 1: one 10: ten 94: ninety four 24: twenty four 2: two 2: two 300: three hundred 450: four hundred and fifty 65: sixty five 100: one hundred 100: one hundred 195: one hundred and ninety five 300: three hundred 165: one hundred and sixty five 37: thirty seven 1: one 28: twenty eight 1814: one thousand, eight hundred and fourteen 200: two hundred 100: one hundred 75: seventy five 100: one hundred 450: four hundred and fifty 32: thirty two 20: twenty 246: two hundred and forty six 270: two hundred and seventy 400: four hundred 125: one hundred and twenty five 380: three hundred and eighty 125: one hundred and twenty five 200: two hundred 250: two hundred and fifty 21: twenty one 1: one 450: four hundred and fifty 40: forty 15: fifteen 170: one hundred and seventy 15: fifteen 700: seven hundred 9: nine 200: two hundred 400: four hundred 75: seventy five 600: six hundred 300: three hundred 170: one hundred and seventy 10: ten 2014: two thousand and fourteen 000: zero 3: three 000: zero 30: thirty 1948: one thousand, nine hundred and forty eight 15: fifteen 50: fifty 20: twenty 18: eighteen 18: eighteen 32: thirty two 100: one hundred 1: one 500: five hundred 1338: one thousand, three hundred and thirty eight 12: twelve 24: twenty four 000: zero 10: ten 1545: one thousand, five hundred and forty five 25: twenty five 2008: two thousand and eight 500: five hundred 112: one hundred and twelve 16: sixteen 35: thirty five 500: five hundred 10: ten 4: four 700: seven hundred 17: seventeen 17: seventeen 2: two 8: eight 17: seventeen 1: one 100: one hundred 2014: two thousand and fourteen 515: five hundred and fifteen 210: two hundred and ten 125: one hundred and twenty five 550: five hundred and fifty 625: six hundred and twenty five 430: four hundred and thirty 150: one hundred and fifty 15: fifteen 75: seventy five 190: one hundred and ninety 133: one hundred and thirty three 295: two hundred and ninety five 37: thirty seven 000: zero 65: sixty five 425: four hundred and twenty five 147: one hundred and forty seven 8: eight 10: ten 62: sixty two 110: one hundred and ten 15: fifteen 20: twenty 20: twenty 21: twenty one 14: fourteen 3: three 25: twenty five 30: thirty 100: one hundred 110: one hundred and ten 800: eight hundred 200: two hundred 000: zero 20: twenty 1: one 400: four hundred 12: twelve 24: twenty four 6000: six thousand 135: one hundred and thirty five 20: twenty 30: thirty 50: fifty 15: fifteen 40: forty 000: zero 1905: one thousand, nine hundred and five 100: one hundred 3: three 100: one hundred 100: one hundred 50: fifty 2014: two thousand and fourteen 1984: one thousand, nine hundred and eighty four 2014: two thousand and fourteen 14: fourteen 87: eighty seven 3: three 20: twenty 65: sixty five 170: one hundred and seventy 11: eleven 2001: two thousand and one 96: ninety six 20: twenty 1945: one thousand, nine hundred and forty five 1965: one thousand, nine hundred and sixty five 1739: one thousand, seven hundred and thirty nine 28: twenty eight 161: one hundred and sixty one 35: thirty five 000: zero 14: fourteen 70: seventy 10: ten 20: twenty 40: forty 1606: one thousand, six hundred and six 10: ten 20: twenty 25: twenty five 300: three hundred 500: five hundred 200: two hundred 1996: one thousand, nine hundred and ninety six 95: ninety five 18: eighteen 59: fifty nine 23: twenty three 18: eighteen 12: twelve 21: twenty one 1976: one thousand, nine hundred and seventy six 25: twenty five 9: nine 458: four hundred and fifty eight 40: forty 11: eleven 15: fifteen 11: eleven 75: seventy five 15: fifteen 15: fifteen 20: twenty 1: one 10: ten 20: twenty 40: forty 85: eighty five 35: thirty five 5: five 1949: one thousand, nine hundred and forty nine 20: twenty 40: forty 11: eleven 70: seventy 30: thirty 000: zero 50: fifty 000: zero 65: sixty five 2016: two thousand and sixteen 28: twenty eight 100: one hundred 150: one hundred and fifty 2050: two thousand and fifty 100: one hundred 1963: one thousand, nine hundred and sixty three 20: twenty 80: eighty 100: one hundred 20: twenty 17: seventeen 000: zero 300: three hundred 46: forty six 700: seven hundred 1527: one thousand, five hundred and twenty seven 82: eighty two 60: sixty 18: eighteen 20: twenty 55: fifty five 1958: one thousand, nine hundred and fifty eight 16: sixteen 13: thirteen 12: twelve 000: zero 100: one hundred 14: fourteen 30: thirty 155: one hundred and fifty five 50: fifty 22: twenty two 000: zero 000: zero 40: forty 100: one hundred 150: one hundred and fifty 52: fifty two 11: eleven 24: twenty four 300: three hundred 400: four hundred 30: thirty 40: forty 30: thirty 27: twenty seven 46: forty six 300: three hundred 10: ten 4: four 7: seven 40: forty 5: five 5: five 80: eighty 80: eighty 150: one hundred and fifty 76: seventy six 20: twenty 2013: two thousand and thirteen 40: forty 20: twenty 930: nine hundred and thirty 1800: one thousand, eight hundred 680: six hundred and eighty 500: five hundred 80: eighty 90: ninety 95: ninety five 50: fifty 11: eleven 24: twenty four 300: three hundred 3: three 000: zero 89: eighty nine 20: twenty 40: forty 5: five 191: one hundred and ninety one 17: seventeen 1963: one thousand, nine hundred and sixty three 40: forty 60: sixty 40: forty 50: fifty 4: four 96: ninety six 0: zero 150: one hundred and fifty 180: one hundred and eighty 5: five 200: two hundred 300: three hundred 26: twenty six 11: eleven 15: fifteen 20: twenty 200: two hundred 120: one hundred and twenty 1: one 20: twenty 90: ninety 100: one hundred 200: two hundred 300: three hundred 100: one hundred 10: ten 20: twenty 82: eighty two 20: twenty 200: two hundred 100: one hundred 200: two hundred 100: one hundred 30: thirty 60: sixty 200: two hundred 200: two hundred 3: three 97: ninety seven 5: five 1: one 5: five 85: eighty five 40: forty 15: fifteen 20: twenty 2: two 4: four 2: two 500: five hundred 20: twenty 20: twenty 25: twenty five 30: thirty 27: twenty seven 000: zero 50: fifty 12: twelve 1: one 12: twelve 1: one 12: twelve 30: thirty 1: one 2: two 000: zero 520: five hundred and twenty 30: thirty 18: eighteen 18: eighteen 16: sixteen 18: eighteen 15: fifteen 20: twenty 7: seven 460: four hundred and sixty 1: one 30: thirty 3: three 2: two 12: twelve 2022: two thousand and twenty two 63: sixty three 3: three 15: fifteen 10: ten 2010: two thousand and ten 1923: one thousand, nine hundred and twenty three 11: eleven 25: twenty five 21: twenty one 100: one hundred 140: one hundred and forty 150: one hundred and fifty 200: two hundred 90: ninety 75: seventy five 20: twenty 16: sixteen 1723: one thousand, seven hundred and twenty three 150: one hundred and fifty 100: one hundred 200: two hundred 50: fifty 100: one hundred 90: ninety 200: two hundred 300: three hundred 30: thirty 24: twenty four 5: five 000: zero 000: zero 1: one 70: seventy 2: two 84: eighty four 1988: one thousand, nine hundred and eighty eight 15: fifteen 150: one hundred and fifty 29: twenty nine 38: thirty eight 1991: one thousand, nine hundred and ninety one 40: forty 55: fifty five 15: fifteen 20: twenty 47: forty seven 28: twenty eight 30: thirty 25: twenty five 98: ninety eight 230: two hundred and thirty 11: eleven 30: thirty 28: twenty eight 1605: one thousand, six hundred and five 40: forty 16: sixteen 80: eighty 120: one hundred and twenty 8: eight 60: sixty 20: twenty 30: thirty 1989: one thousand, nine hundred and eighty nine 20: twenty 18: eighteen 1987: one thousand, nine hundred and eighty seven 1926: one thousand, nine hundred and twenty six 13: thirteen 14: fourteen 250: two hundred and fifty 150: one hundred and fifty 400: four hundred 500: five hundred 120: one hundred and twenty 40: forty 40: forty 7: seven 9: nine 12: twelve 30: thirty 90: ninety 15: fifteen 150: one hundred and fifty 100: one hundred 180: one hundred and eighty 000: zero 175: one hundred and seventy five 150: one hundred and fifty 1828: one thousand, eight hundred and twenty eight 70: seventy 56: fifty six 10: ten 15: fifteen 20: twenty 130: one hundred and thirty 15: fifteen 20: twenty 12: twelve 330: three hundred and thirty 400: four hundred 100: one hundred 100: one hundred 100: one hundred 1911: one thousand, nine hundred and eleven 10: ten 340: three hundred and forty 75: seventy five 60: sixty 1828: one thousand, eight hundred and twenty eight 70: seventy 14: fourteen 15: fifteen 1651: one thousand, six hundred and fifty one 95: ninety five 20: twenty 50: fifty 60: sixty 520: five hundred and twenty 750: seven hundred and fifty 1948: one thousand, nine hundred and forty eight 800: eight hundred 20: twenty 25: twenty five 25: twenty five 60: sixty 500: five hundred 000: zero 58: fifty eight 20: twenty 750: seven hundred and fifty 90: ninety 10: ten 15: fifteen 1: one 12: twelve 000: zero 000: zero 20: twenty 24: twenty four 34: thirty four 729: seven hundred and twenty nine 20: twenty 10: ten 100: one hundred 130: one hundred and thirty 26: twenty six 000: zero 800: eight hundred 37: thirty seven 34: thirty four 90: ninety 15: fifteen 465: four hundred and sixty five 25: twenty five 000: zero 800: eight hundred 15: fifteen 3: three 50: fifty 2: two 18: eighteen 400: four hundred 415: four hundred and fifteen 30: thirty 19: nineteen 23: twenty three 400: four hundred 1706: one thousand, seven hundred and six 16: sixteen 20: twenty 200: two hundred 21: twenty one 500: five hundred 125: one hundred and twenty five 45: forty five 28: twenty eight 29: twenty nine 54: fifty four 16: sixteen 6: six 60: sixty 20: twenty 67: sixty seven 22: twenty two 60: sixty 000: zero 15: fifteen 200: two hundred 5: five 600: six hundred 15: fifteen 1933: one thousand, nine hundred and thirty three 10: ten 40: forty 90: ninety 000: zero 100: one hundred 000: zero 55: fifty five 400: four hundred 30: thirty 000: zero 246: two hundred and forty six 68: sixty eight 30: thirty 135: one hundred and thirty five 160: one hundred and sixty 175: one hundred and seventy five 000: zero 000: zero 2006: two thousand and six 57: fifty seven 185: one hundred and eighty five 125: one hundred and twenty five 205: two hundred and five 950: nine hundred and fifty 60: sixty 125: one hundred and twenty five 450: four hundred and fifty 385: three hundred and eighty five 5: five 7: seven 3: three 31: thirty one 80: eighty 69: sixty nine 8: eight 000: zero 350: three hundred and fifty 400: four hundred 550: five hundred and fifty 600: six hundred 650: six hundred and fifty 000: zero 000: zero 165: one hundred and sixty five 15: fifteen 90: ninety 20: twenty 10: ten 85: eighty five 100: one hundred 2: two 120: one hundred and twenty 13: thirteen 260: two hundred and sixty 27: twenty seven 61: sixty one 300: three hundred 165: one hundred and sixty five 175: one hundred and seventy five 50: fifty 225: two hundred and twenty five 35: thirty five 35: thirty five 42: forty two 500: five hundred 70: seventy 700: seven hundred 245: two hundred and forty five 90: ninety 120: one hundred and twenty 200: two hundred 65: sixty five 50: fifty 2: two 60: sixty 8: eight 425: four hundred and twenty five 225: two hundred and twenty five 1: one 200: two hundred 99: ninety nine 50: fifty 000: zero 400: four hundred 500: five hundred 100: one hundred 300: three hundred 500: five hundred 99: ninety nine 3: three 1: one 35: thirty five 270: two hundred and seventy 32: thirty two 32: thirty two 30: thirty 20: twenty 130: one hundred and thirty 300: three hundred 23: twenty three 12: twelve 22: twenty two 20: twenty 22: twenty two 29: twenty nine 000: zero 60: sixty 10: ten 15: fifteen 239: two hundred and thirty nine 14: fourteen 250: two hundred and fifty 16: sixteen 350: three hundred and fifty 370: three hundred and seventy 350: three hundred and fifty 000: zero 99: ninety nine 53: fifty three 5: five 75: seventy five 40: forty 40: forty 30: thirty 2017: two thousand and seventeen 2: two 16: sixteen 18: eighteen 5: five 10: ten 20: twenty 6: six 15: fifteen 20: twenty 000: zero 1918: one thousand, nine hundred and eighteen 1923: one thousand, nine hundred and twenty three 53: fifty three 55: fifty five 130: one hundred and thirty 12: twelve 69: sixty nine 160: one hundred and sixty 36: thirty six 200: two hundred 100: one hundred 150: one hundred and fifty 95: ninety five 9: nine 155: one hundred and fifty five 10: ten 25: twenty five 30: thirty 235: two hundred and thirty five 125: one hundred and twenty five 110: one hundred and ten 1981: one thousand, nine hundred and eighty one 12: twelve 50: fifty 10: ten 1: one 1967: one thousand, nine hundred and sixty seven 90: ninety 400: four hundred 40: forty 4: four 000: zero 50: fifty 30: thirty 100: one hundred 25: twenty five 8: eight 1745: one thousand, seven hundred and forty five 000: zero 8: eight 5: five 2016: two thousand and sixteen 10: ten 1: one 15: fifteen 80: eighty 35: thirty five 20: twenty 100: one hundred 50: fifty 40: forty 10: ten 10: ten 620: six hundred and twenty 12: twelve 350: three hundred and fifty 500: five hundred 14: fourteen 10: ten 11: eleven 11: eleven 10: ten 647: six hundred and forty seven 101: one hundred and one 30: thirty 200: two hundred 200: two hundred 300: three hundred 140: one hundred and forty 1918: one thousand, nine hundred and eighteen 1961: one thousand, nine hundred and sixty one 1940: one thousand, nine hundred and forty 18: eighteen 250: two hundred and fifty 165: one hundred and sixty five 85: eighty five 42: forty two 820: eight hundred and twenty 44: forty four 250: two hundred and fifty 1759: one thousand, seven hundred and fifty nine 100: one hundred 15: fifteen 60: sixty 20: twenty 30: thirty 23: twenty three 13: thirteen 7: seven 000: zero 000: zero 9000: nine thousand 966: nine hundred and sixty six 100: one hundred 55: fifty five 15: fifteen 500: five hundred 14: fourteen 65: sixty five 96: ninety six 5: five 20: twenty 35: thirty five 200: two hundred 30: thirty 1940: one thousand, nine hundred and forty 15: fifteen 18: eighteen 5: five 30: thirty 20: twenty 100: one hundred 14: fourteen 95: ninety five 4: four 2: two 1960: one thousand, nine hundred and sixty 1963: one thousand, nine hundred and sixty three 57: fifty seven 900: nine hundred 6: six 90: ninety 40: forty 000: zero 2: two 30: thirty 000: zero 2009: two thousand and nine 10: ten 10: ten 40: forty 60: sixty 25: twenty five 35: thirty five 78: seventy eight 1826: one thousand, eight hundred and twenty six 12: twelve 6: six 25: twenty five 27: twenty seven 1: one 300: three hundred 400: four hundred 100: one hundred 60: sixty 425: four hundred and twenty five 000: zero 10: ten 13: thirteen 425: four hundred and twenty five 6: six 100: one hundred 70: seventy 6: six 100: one hundred 17: seventeen 100: one hundred 120: one hundred and twenty 50: fifty 5: five 12: twelve 75: seventy five 10: ten 209: two hundred and nine 50: fifty 260: two hundred and sixty 260: two hundred and sixty 160: one hundred and sixty 14: fourteen 50: fifty 26: twenty six 18: eighteen 8: eight 70: seventy 20: twenty 500: five hundred 1: one 3: three 12: twelve 1878: one thousand, eight hundred and seventy eight 30: thirty 40: forty 6: six 2015: two thousand and fifteen 7: seven 12: twelve 1977: one thousand, nine hundred and seventy seven 30: thirty 1954: one thousand, nine hundred and fifty four 20: twenty 12: twelve 2015: two thousand and fifteen 2: two 54: fifty four 10: ten 24: twenty four 300: three hundred 218: two hundred and eighteen 35: thirty five 1951: one thousand, nine hundred and fifty one 20: twenty 90: ninety 90: ninety 15: fifteen 854: eight hundred and fifty four 1985: one thousand, nine hundred and eighty five 20: twenty 000: zero 30: thirty 3: three 3: three 5: five 49: forty nine 350: three hundred and fifty 100: one hundred 200: two hundred 105: one hundred and five 12: twelve 1: one 5: five 400: four hundred 5: five 1: one 2: two 000: zero 2011: two thousand and eleven 1911: one thousand, nine hundred and eleven 1967: one thousand, nine hundred and sixty seven 15: fifteen 11: eleven 2016: two thousand and sixteen 646: six hundred and forty six 2: two 2: two 1973: one thousand, nine hundred and seventy three 65: sixty five 100: one hundred 150: one hundred and fifty 600: six hundred 400: four hundred 500: five hundred 1994: one thousand, nine hundred and ninety four 17: seventeen 30: thirty 15: fifteen 200: two hundred 300: three hundred 15: fifteen 2016: two thousand and sixteen 50: fifty 50: fifty 100: one hundred 520: five hundred and twenty 150: one hundred and fifty 2300: two thousand, three hundred 24: twenty four 15: fifteen 40: forty 10: ten 200: two hundred 1: one 700: seven hundred 200: two hundred 18: eighteen 15: fifteen 20: twenty 14: fourteen 69: sixty nine 3: three 200: two hundred 25: twenty five 930: nine hundred and thirty 10: ten 2016: two thousand and sixteen 2016: two thousand and sixteen 2016: two thousand and sixteen 180: one hundred and eighty 13: thirteen 7: seven 1852: one thousand, eight hundred and fifty two 1: one 35: thirty five 150: one hundred and fifty 42: forty two 2: two 100: one hundred 70: seventy 100: one hundred 20: twenty 30: thirty 4: four 60: sixty 100: one hundred 100: one hundred 700: seven hundred 20: twenty 000: zero 70: seventy 1: one 20: twenty 65: sixty five 70: seventy 20: twenty 20: twenty 100: one hundred 000: zero 30: thirty 110: one hundred and ten 22: twenty two 24: twenty four 120: one hundred and twenty 10: ten 12: twelve 14: fourteen 5: five 31: thirty one 12: twelve 7: seven 9: nine 2: two 74: seventy four 1: one 2009: two thousand and nine 40: forty 1969: one thousand, nine hundred and sixty nine 1998: one thousand, nine hundred and ninety eight 14: fourteen 1973: one thousand, nine hundred and seventy three 000: zero 100: one hundred 25: twenty five 20: twenty 48: forty eight 17: seventeen 24: twenty four 4: four 000: zero 2: two 55: fifty five 4: four 24: twenty four 28: twenty eight 27: twenty seven 27: twenty seven 30: thirty 30: thirty 12: twelve 600: six hundred 500: five hundred 1835: one thousand, eight hundred and thirty five 22: twenty two 20: twenty 45: forty five 60: sixty 89: eighty nine 40: forty 0: zero 12: twelve 10: ten 1826: one thousand, eight hundred and twenty six 000: zero 5: five 1848: one thousand, eight hundred and forty eight 14: fourteen 16: sixteen 11: eleven 800: eight hundred 300: three hundred 000: zero 100: one hundred 600: six hundred 125: one hundred and twenty five 20: twenty 85: eighty five 40: forty 8: eight 18: eighteen 73: seventy three 40: forty 2: two 1948: one thousand, nine hundred and forty eight 80: eighty 500: five hundred 200: two hundred 400: four hundred 1948: one thousand, nine hundred and forty eight 20: twenty 1790: one thousand, seven hundred and ninety 1984: one thousand, nine hundred and eighty four 2: two 000: zero 400: four hundred 150: one hundred and fifty 15: fifteen 25: twenty five 000: zero 1: one 000: zero 47: forty seven 22: twenty two 5: five 40: forty 100: one hundred 40: forty 10: ten 2: two 1: one 49: forty nine 100: one hundred 350: three hundred and fifty 155: one hundred and fifty five 90: ninety 95: ninety five 000: zero 325: three hundred and twenty five 320: three hundred and twenty 325: three hundred and twenty five 10: ten 20: twenty 15: fifteen 30: thirty 80: eighty 60: sixty 15: fifteen 1: one 10: ten 20: twenty 2: two 1848: one thousand, eight hundred and forty eight 1842: one thousand, eight hundred and forty two 90: ninety 25: twenty five 30: thirty 10: ten
In [ ]:
# Tokenize all sentences in the dataframe
all_tokens = [token for sentence in df['sentence'] for token in nltk.word_tokenize(sentence)]
# Count the frequency of each token
token_counts = Counter(all_tokens)
# Get the top 20 most frequent tokens
common_tokens = token_counts.most_common(20)
# Set a professional color palette and style
sns.set_style("ticks", {"xtick.major.size": 8, "ytick.major.size": 8})
sns.set_context("talk", font_scale=0.8)
color = '#2980B9' # Slightly deeper shade of blue
# Plot
plt.figure(figsize=(13, 12))
# Plotting each bar with the refined color
tokens, frequencies = zip(*common_tokens)
for token, freq in common_tokens:
plt.barh(token, freq, color=color, edgecolor='silver', height=0.7)
plt.text(freq + 10, token, str(freq), va='center', color='black', fontsize=12) # Adjusted annotation
# Refining title and axis labels for a polished look
plt.title('Top 20 Most Frequent Tokens', fontsize=20, fontweight='bold', pad=20)
plt.xlabel('Frequency', fontsize=16)
plt.ylabel('Tokens', fontsize=16)
plt.gca().invert_yaxis() # To display the most frequent token at the top
# Introducing subtle gridlines for better mapping
plt.grid(axis='x', linestyle='--', alpha=0.6)
# Adjusting axis ticks for aesthetics
plt.xticks(fontsize=14)
plt.yticks(fontsize=14)
plt.tight_layout()
plt.show()
In [ ]:
df.head()
Out[ ]:
| sentence | |
|---|---|
| 0 | WHEN YOU'RE COOKING CHIPS AT HOME |
| 1 | THE TRADITIONAL CHIP PAN OFTEN STAYS ON THE SHELF |
| 2 | THROUGH WHAT THEY CALL A KNIFE BLOCK |
| 3 | WHICH INVOLVES FIRING A POTATO DOWN A PIPE |
| 4 | APART FROM THE GOLDEN COLOUR AND THE DELICIOUS... |
In [ ]:
for percentile in [25, 50, 75, 90, 95, 99]:
print(f"{percentile}th percentile:", np.percentile(sentence_lengths, percentile))
unique_words = set(word for sentence in df['sentence'] for word in sentence.split())
print("Total unique words:", len(unique_words))
word_counts = Counter(word for sentence in df['sentence'] for word in sentence.split())
print("Most common words:", word_counts.most_common(10))
print("Least common words:", word_counts.most_common()[:-11:-1])
25th percentile: 5.0
50th percentile: 6.0
75th percentile: 9.0
90th percentile: 13.0
95th percentile: 16.0
99th percentile: 20.0
Total unique words: 17388
Most common words: [('THE', 16538), ('TO', 9609), ('A', 8610), ('AND', 8595), ('OF', 7332), ('I', 5829), ('IT', 5226), ('IN', 5052), ('THAT', 4827), ('YOU', 4757)]
Least common words: [('SEIZURES', 1), ('PERSUADERS', 1), ('BANKRUPTING', 1), ('REWROTE', 1), ('FLAWS', 1), ('RHINE', 1), ('BROCKEN', 1), ('CROWDED', 1), ("TROTSKY'S", 1), ('UNISON', 1)]
In [ ]:
# Define a pattern for common contractions
common_contractions_pattern = r"\b(?:[a-zA-Z]+n't|[a-zA-Z]+'ll|[a-zA-Z]+'ve|[a-zA-Z]+'re|[a-zA-Z]+'d|[a-zA-Z]+'s)\b"
# Find common contractions in each line and store them
contractions_counter = Counter()
for line in df['sentence']:
contractions_counter.update(re.findall(common_contractions_pattern, line.lower()))
# Get the most common contractions and their counts
most_common_contractions = contractions_counter.most_common()
# Calculate total contractions found
total_contractions = sum(contractions_counter.values())
most_common_contractions, total_contractions
Out[ ]:
([("it's", 2445),
("that's", 1015),
("don't", 978),
("you're", 522),
("i've", 494),
("we've", 492),
("there's", 422),
("we're", 416),
("they're", 391),
("let's", 358),
("you've", 345),
("can't", 313),
("he's", 312),
("didn't", 258),
("i'll", 201),
("i'd", 187),
("she's", 186),
("what's", 183),
("wasn't", 177),
("doesn't", 163),
("they've", 157),
("we'll", 155),
("wouldn't", 122),
("haven't", 100),
("won't", 98),
("you'll", 89),
("couldn't", 85),
("isn't", 85),
("today's", 80),
("you'd", 76),
("they'll", 68),
("we'd", 68),
("he'd", 53),
("weren't", 49),
("aren't", 41),
("they'd", 41),
("who's", 41),
("it'll", 38),
("here's", 35),
("hadn't", 30),
("year's", 27),
("britain's", 26),
("tonight's", 26),
("world's", 25),
("people's", 23),
("shouldn't", 22),
("everyone's", 21),
("hasn't", 20),
("he'll", 19),
("everybody's", 18),
("would've", 18),
("she'd", 15),
("life's", 12),
("mother's", 11),
("children's", 11),
("father's", 11),
("week's", 11),
("who've", 10),
("someone's", 9),
("wife's", 9),
("women's", 9),
("ain't", 9),
("man's", 9),
("nation's", 9),
("bbc's", 8),
("it'd", 8),
("she'll", 8),
("one's", 7),
("name's", 7),
("weekend's", 7),
("how's", 7),
("dad's", 7),
("night's", 7),
("that'll", 7),
("london's", 6),
("king's", 6),
("mum's", 6),
("where's", 6),
("time's", 6),
("matt's", 5),
("thing's", 5),
("market's", 5),
("weather's", 5),
("everything's", 5),
("there'll", 5),
("paul's", 5),
("bradshaw's", 5),
("queen's", 5),
("daren't", 4),
("europe's", 4),
("boy's", 4),
("country's", 4),
("nature's", 4),
("else's", 4),
("england's", 4),
("men's", 4),
("tv's", 4),
("team's", 4),
("something's", 4),
("somebody's", 4),
("work's", 4),
("phil's", 4),
("webster's", 4),
("shakespeare's", 4),
("peter's", 4),
("month's", 3),
("other's", 3),
("anything's", 3),
("dave's", 3),
("town's", 3),
("city's", 3),
("god's", 3),
("who'd", 3),
("woman's", 3),
("uk's", 3),
("kate's", 3),
("henry's", 3),
("island's", 3),
("county's", 3),
("girl's", 3),
("day's", 3),
("charlie's", 3),
("nobody's", 3),
("david's", 3),
("bid's", 3),
("grandmother's", 3),
("gentleman's", 3),
("tom's", 3),
("tomorrow's", 3),
("harm's", 3),
("edward's", 3),
("hogarth's", 3),
("mustn't", 3),
("brother's", 3),
("family's", 3),
("sun's", 2),
("soldier's", 2),
("should've", 2),
("son's", 2),
("show's", 2),
("christ's", 2),
("lawrence's", 2),
("money's", 2),
("planet's", 2),
("thomas's", 2),
("person's", 2),
("company's", 2),
("majesty's", 2),
("individual's", 2),
("buyer's", 2),
("mistress's", 2),
("george's", 2),
("pam's", 2),
("labour's", 2),
("club's", 2),
("miranda's", 2),
("centurion's", 2),
("john's", 2),
("gourmet's", 2),
("shan't", 2),
("november's", 2),
("spencer's", 2),
("jack's", 2),
("farming's", 2),
("maker's", 2),
("jesus's", 2),
("brand's", 2),
("rhod's", 2),
("mark's", 2),
("there'd", 2),
("when's", 2),
("valentine's", 2),
("whatever's", 2),
("busman's", 2),
("relief's", 2),
("item's", 2),
("oak's", 2),
("lee's", 2),
("georgie's", 2),
("summer's", 2),
("shepherd's", 2),
("nash's", 2),
("animal's", 2),
("alzheimer's", 2),
("doctor's", 2),
("husband's", 2),
("bobby's", 2),
("america's", 2),
("cathedral's", 2),
("gentlemen's", 2),
("tim's", 2),
("could've", 2),
("daddy's", 2),
("mick's", 2),
("emma's", 2),
("yesterday's", 2),
("television's", 2),
("anybody's", 2),
("agency's", 2),
("roscoff's", 2),
("paula's", 2),
("lady's", 2),
("saleroom's", 2),
("pete's", 2),
("goat's", 2),
("gully's", 1),
("sheep's", 1),
("later's", 1),
("barr's", 1),
("gaynor's", 1),
("bar's", 1),
("church's", 1),
("rachel's", 1),
("age's", 1),
("galileo's", 1),
("jennifer's", 1),
("kathy's", 1),
("titchmarsh's", 1),
("century's", 1),
("conqueror's", 1),
("dermot's", 1),
("damien's", 1),
("bohemond's", 1),
("marconi's", 1),
("annie's", 1),
("richard's", 1),
("topography's", 1),
("owner's", 1),
("chief's", 1),
("handler's", 1),
("hunt's", 1),
("government's", 1),
("riding's", 1),
("nhs'll", 1),
("katy's", 1),
("sotheby's", 1),
("eyre's", 1),
("cromwell's", 1),
("spix's", 1),
("nic's", 1),
("dealer's", 1),
("parent's", 1),
("frank's", 1),
("legion's", 1),
("derbyshire's", 1),
("cassini's", 1),
("newborn's", 1),
("garrow's", 1),
("clive's", 1),
("neck's", 1),
("edmund's", 1),
("channel's", 1),
("cartland's", 1),
("howard's", 1),
("bpa's", 1),
("wren's", 1),
("eamonn's", 1),
("daimler's", 1),
("juana's", 1),
("barrow's", 1),
("holly's", 1),
("sue's", 1),
("flavour's", 1),
("so's", 1),
("martin's", 1),
("hancock's", 1),
("smith's", 1),
("mankind's", 1),
("value's", 1),
("phone's", 1),
("eric's", 1),
("gillian's", 1),
("author's", 1),
("victoria's", 1),
("pamela's", 1),
("hour's", 1),
("grandfather's", 1),
("wheatley's", 1),
("jackie's", 1),
("malta's", 1),
("gormley's", 1),
("deer's", 1),
("rate's", 1),
("dunbar's", 1),
("anyone's", 1),
("sande's", 1),
("principle's", 1),
("gordon's", 1),
("julia's", 1),
("think's", 1),
("margaret's", 1),
("gabby's", 1),
("ronnie's", 1),
("baxter's", 1),
("canopy's", 1),
("bird's", 1),
("minton's", 1),
("alexandra's", 1),
("clerk's", 1),
("tb's", 1),
("chemist's", 1),
("fermi's", 1),
("jeanette's", 1),
("macmillan's", 1),
("drake's", 1),
("bottom's", 1),
("watkins's", 1),
("peterborough's", 1),
("linda's", 1),
("churchill's", 1),
("band's", 1),
("liverpool's", 1),
("bretby's", 1),
("auction's", 1),
("kitchener's", 1),
("blacksmith's", 1),
("constantine's", 1),
("justinian's", 1),
("orwell's", 1),
("roadshow's", 1),
("emperor's", 1),
("b's", 1),
("boudicca's", 1),
("part's", 1),
("alan's", 1),
("mortimer's", 1),
("commander's", 1),
("this'll", 1),
("daphne's", 1),
("chris's", 1),
("vicar's", 1),
("teddy's", 1),
("rome's", 1),
("devon's", 1),
("clayton's", 1),
("adam's", 1),
("nottingham's", 1),
("hollywood's", 1),
("andrew's", 1),
("denny's", 1),
("derby's", 1),
("that'd", 1),
("director's", 1),
("driver's", 1),
("ship's", 1),
("pop's", 1),
("sullivan's", 1),
("jamie's", 1),
("betty's", 1),
("dad'll", 1),
("lalique's", 1),
("laura's", 1),
("suzanne's", 1),
("jaguar's", 1),
("kat's", 1),
("kerr's", 1),
("tennyson's", 1),
("past's", 1),
("peacock's", 1),
("cow's", 1),
("parson's", 1),
("caroline's", 1),
("fire's", 1),
("friend's", 1),
("salesmen's", 1),
("darren's", 1),
("original's", 1),
("bernice's", 1),
("empire's", 1),
("marie's", 1),
("saul's", 1),
("canine's", 1),
("charlotte's", 1),
("farm's", 1),
("giant's", 1),
("damian's", 1),
("foxe's", 1),
("barbara's", 1),
("builder's", 1),
("edith's", 1),
("decision's", 1),
("ve'll", 1),
("hamish's", 1),
("tree's", 1),
("mcclintock's", 1),
("prince's", 1),
("cheque's", 1),
("australia's", 1),
("music's", 1),
("russell's", 1),
("hairdresser's", 1),
("lucy's", 1),
("cadbury's", 1),
("water's", 1),
("devil's", 1),
("venue's", 1),
("artist's", 1),
("beard's", 1),
("germany's", 1),
("juliet's", 1),
("player's", 1),
("torrin's", 1),
("hackman's", 1),
("photographer's", 1),
("madeira's", 1),
("monk's", 1),
("trinian's", 1),
("pont's", 1),
("tyler's", 1),
("love's", 1),
("naani's", 1),
("heston's", 1),
("mayor's", 1),
("scotland's", 1),
("chain's", 1),
("philip's", 1),
("tripper's", 1),
("len's", 1),
("building's", 1),
("byron's", 1),
("gear's", 1),
("limestone's", 1),
("mary's", 1),
("asprey's", 1),
("workmen's", 1),
("snake's", 1),
("washington's", 1),
("astley's", 1),
("smart's", 1),
("oakey's", 1),
("castle's", 1),
("miner's", 1),
("kent's", 1),
("story's", 1),
("mexico's", 1),
("collector's", 1),
("pm's", 1),
("fiction's", 1),
("ballard's", 1),
("wilson's", 1),
("gaulle's", 1),
("sony's", 1),
("korea's", 1),
("auctioneer's", 1),
("jessica's", 1),
("donkey's", 1),
("audrey's", 1),
("rodney's", 1),
("sharon's", 1),
("car's", 1),
("relative's", 1),
("france's", 1),
("bloke's", 1),
("catherine's", 1),
("merchant's", 1),
("kathleen's", 1),
("calm's", 1),
("rspb's", 1),
("viii's", 1),
("glitter's", 1),
("hartley's", 1),
("debbie's", 1),
("aim's", 1),
("grandma's", 1),
("heart's", 1),
("bertie's", 1),
("saddle's", 1),
("firm's", 1),
("machine's", 1),
("manor's", 1),
("ted's", 1),
("sunderland's", 1),
("cabot's", 1),
("tot's", 1),
("belfort's", 1),
("fisherman's", 1),
("half's", 1),
("season's", 1),
("frost's", 1),
("client's", 1),
("corvette's", 1),
("people've", 1),
("publisher's", 1),
("cameron's", 1),
("where'd", 1),
("adrian's", 1),
("julie's", 1),
("eve's", 1),
("clarkson's", 1),
("payer's", 1),
("hammer's", 1),
("hepburn's", 1),
("peck's", 1),
("evil's", 1),
("sandy's", 1),
("clare's", 1),
("barry's", 1),
("hitler's", 1),
("leg's", 1),
("spock's", 1),
("poppy's", 1),
("cinema's", 1),
("lord's", 1),
("morsi's", 1),
("incedal's", 1),
("now's", 1),
("generation's", 1),
("community's", 1),
("why've", 1),
("ben's", 1),
("photo's", 1),
("grainger's", 1),
("evening's", 1),
("couple's", 1),
("grace's", 1),
("store's", 1),
("brahms's", 1),
("fox's", 1),
("wellington's", 1),
("forum's", 1),
("property's", 1),
("bathroom's", 1),
("sunday's", 1),
("bill's", 1),
("crew's", 1),
("who'll", 1),
("teacher's", 1),
("justin's", 1),
("there've", 1),
("roman's", 1),
("dante's", 1),
("sailor's", 1),
("eva's", 1),
("monica's", 1),
("jade's", 1),
("mar's", 1),
("moorcroft's", 1),
("jay's", 1),
("military's", 1),
("hitchhiker's", 1),
("pilot's", 1),
("duxford's", 1),
("veteran's", 1),
("ireland's", 1),
("tea's", 1),
("graham's", 1),
("shazia's", 1),
("helen's", 1),
("bishop's", 1),
("beeching's", 1),
("might've", 1),
("jenny's", 1),
("jonathan's", 1),
("monday's", 1),
("control's", 1),
("adele's", 1),
("parkinson's", 1),
("stephen's", 1),
("savile's", 1),
("gilding's", 1),
("owen's", 1),
("professor's", 1),
("olympian's", 1),
("hodgkin's", 1),
("trump's", 1),
("eleanor's", 1),
("craig's", 1),
("alia's", 1),
("ram's", 1),
("college's", 1),
("harrison's", 1),
("pat's", 1),
("sister's", 1),
("practice's", 1),
("madonna's", 1),
("january's", 1),
("museum's", 1),
("madge's", 1),
("rene's", 1),
("reader's", 1),
("brian's", 1),
("flossy's", 1),
("countryfile's", 1),
("kevin's", 1),
("hubble's", 1),
("bang's", 1),
("alexander's", 1),
("aleksandr's", 1),
("moscow's", 1),
("harold's", 1),
("arctic's", 1),
("technology's", 1),
("patient's", 1),
("cbbc's", 1),
("charity's", 1),
("dude's", 1),
("janet's", 1),
("hand's", 1),
("dot's", 1),
("economy's", 1),
("william's", 1),
("sian's", 1),
("braxton's", 1),
("weston's", 1),
("tumour's", 1),
("gina's", 1),
("candidate's", 1),
("must've", 1),
("madeline's", 1),
("diamond's", 1),
("hammock's", 1),
("polo's", 1),
("humanity's", 1),
("maxwell's", 1),
("university's", 1),
("whoever's", 1),
("gregg's", 1),
("trotsky's", 1)],
12608)
In [ ]:
# Calculate word count for sentences
df['word_count'] = df['sentence'].apply(lambda x: len(x.split()))
# Print statistics on word counts
print(df['word_count'].describe())
# Visualization: Histograms of sentence word counts
plt.hist(df['word_count'], bins=20, alpha=0.7)
plt.title('Word Counts in Sentences')
plt.xlabel('Word Count')
plt.ylabel('Frequency')
plt.show()
count 45839.000000 mean 7.231702 std 3.770229 min 3.000000 25% 4.000000 50% 6.000000 75% 9.000000 max 28.000000 Name: word_count, dtype: float64
In [ ]:
# Create inflect engine once
p = inflect.engine()
def convert_numerical_ordinals_to_words(text):
words = text.split()
for i, word in enumerate(words):
# Removing punctuation for better matching
clean_word = word.rstrip(string.punctuation)
if match(r'\d+(st|nd|rd|th)', clean_word):
number = match(r'\d+', clean_word).group()
word_ordinal = p.number_to_words(int(number), ordinal=True, andword=' ', zero='zero', one='one')
# Retain the punctuation after conversion
punctuation = word[len(clean_word):]
word_ordinal += punctuation
words[i] = word_ordinal
return ' '.join(words)
In [ ]:
# Convert any numerical ordinals in the sentences to their word form
df['sentence'] = df['sentence'].apply(convert_numerical_ordinals_to_words)
# Display the first few rows to verify the changes
print(df.head())
sentence word_count 0 WHEN YOU'RE COOKING CHIPS AT HOME 6 1 THE TRADITIONAL CHIP PAN OFTEN STAYS ON THE SHELF 9 2 THROUGH WHAT THEY CALL A KNIFE BLOCK 7 3 WHICH INVOLVES FIRING A POTATO DOWN A PIPE 8 4 APART FROM THE GOLDEN COLOUR AND THE DELICIOUS... 9
In [ ]:
df_original = df.copy(deep=True)
In [ ]:
# Backup the sentences before conversion
df['original_sentence'] = df['sentence'].copy()
# Display a few randomly selected original and converted sentences for comparison
sample_sentences = df.sample(10)
for index, row in sample_sentences.iterrows():
print(f"Original: {row['original_sentence']}")
print(f"Converted: {row['sentence']}")
print("------")
# Compute statistics
df['word_count_after_conversion'] = df['sentence'].apply(lambda x: len(x.split()))
print("\nStatistics after conversion:")
print(df['word_count_after_conversion'].describe())
# Visualization: Histograms of sentence lengths after conversion
plt.hist(df['word_count_after_conversion'], bins=20, alpha=0.7, color='blue', label='After Conversion')
plt.hist(df['word_count'], bins=20, alpha=0.7, color='red', label='Before Conversion')
plt.title('Sentence Lengths Comparison')
plt.xlabel('Length (words)')
plt.ylabel('Frequency')
plt.legend()
plt.show()
Original: CAN WE FORGET ABOUT THE PRICE TAG Converted: CAN WE FORGET ABOUT THE PRICE TAG ------ Original: BUT IT IS A PLEASURE TO SIT THERE AND SEE WHAT TURNS UP Converted: BUT IT IS A PLEASURE TO SIT THERE AND SEE WHAT TURNS UP ------ Original: MY REAL NAME IS BASIL DEVERE COURTNEY Converted: MY REAL NAME IS BASIL DEVERE COURTNEY ------ Original: SO FOR EVERY one hundred Converted: SO FOR EVERY one hundred ------ Original: THEY'RE NOT SECOND HAND OR THIRD HAND Converted: THEY'RE NOT SECOND HAND OR THIRD HAND ------ Original: THERE IS A RARITY FACTOR Converted: THERE IS A RARITY FACTOR ------ Original: WHY DON'T WE HAVE A LOOK AT HOW POOR COLIN YOUNG IS GETTING ON WITH THE BLUE TEAM'S BONUS Converted: WHY DON'T WE HAVE A LOOK AT HOW POOR COLIN YOUNG IS GETTING ON WITH THE BLUE TEAM'S BONUS ------ Original: THE GAME GREW IN POPULARITY Converted: THE GAME GREW IN POPULARITY ------ Original: AS SOON AS THE DATE WAS ANNOUNCED Converted: AS SOON AS THE DATE WAS ANNOUNCED ------ Original: I'D SAY fifty TO eighty Converted: I'D SAY fifty TO eighty ------ Statistics after conversion: count 45839.000000 mean 7.231702 std 3.770229 min 3.000000 25% 4.000000 50% 6.000000 75% 9.000000 max 28.000000 Name: word_count_after_conversion, dtype: float64
In [ ]:
#Create a copy of the dataframe
df_copy = df.copy()
#Compare and create a 'changed' column
df['changed'] = df['sentence'] != df_copy['sentence']
# Obtain statistics
changed_count = df['changed'].sum()
unchanged_count = len(df) - changed_count
print(f"Number of sentences that changed: {changed_count}")
print(f"Number of sentences that remained unchanged: {unchanged_count}")
Number of sentences that changed: 0 Number of sentences that remained unchanged: 45839
In [ ]:
# List to store words that were converted
converted_words_list = []
# Iterate through each row of the dataframe
for index, row in df.iterrows():
original_words = df_copy.loc[index, 'sentence'].split()
converted_words = row['sentence'].split()
for orig, conv in zip(original_words, converted_words):
if orig != conv:
converted_words_list.append((orig, conv))
# Count the occurrence of each conversion
conversion_counter = Counter(converted_words_list)
# Display the most common conversions
common_conversions = conversion_counter.most_common()
print("Most common word conversions:")
for conversion, count in common_conversions:
orig, conv = conversion
print(f"{orig} -> {conv}: {count} times")
Most common word conversions:
In [ ]:
# copy df_expanded to df_before_token
df_before_token = df.copy()
In [ ]:
print(df.head())
sentence word_count \
0 WHEN YOU'RE COOKING CHIPS AT HOME 6
1 THE TRADITIONAL CHIP PAN OFTEN STAYS ON THE SHELF 9
2 THROUGH WHAT THEY CALL A KNIFE BLOCK 7
3 WHICH INVOLVES FIRING A POTATO DOWN A PIPE 8
4 APART FROM THE GOLDEN COLOUR AND THE DELICIOUS... 9
original_sentence \
0 WHEN YOU'RE COOKING CHIPS AT HOME
1 THE TRADITIONAL CHIP PAN OFTEN STAYS ON THE SHELF
2 THROUGH WHAT THEY CALL A KNIFE BLOCK
3 WHICH INVOLVES FIRING A POTATO DOWN A PIPE
4 APART FROM THE GOLDEN COLOUR AND THE DELICIOUS...
word_count_after_conversion changed
0 6 False
1 9 False
2 7 False
3 8 False
4 9 False
In [ ]:
df['sentence'] = df['sentence'].str.lower()
df.head()
Out[ ]:
| sentence | word_count | original_sentence | word_count_after_conversion | changed | |
|---|---|---|---|---|---|
| 0 | when you're cooking chips at home | 6 | WHEN YOU'RE COOKING CHIPS AT HOME | 6 | False |
| 1 | the traditional chip pan often stays on the shelf | 9 | THE TRADITIONAL CHIP PAN OFTEN STAYS ON THE SHELF | 9 | False |
| 2 | through what they call a knife block | 7 | THROUGH WHAT THEY CALL A KNIFE BLOCK | 7 | False |
| 3 | which involves firing a potato down a pipe | 8 | WHICH INVOLVES FIRING A POTATO DOWN A PIPE | 8 | False |
| 4 | apart from the golden colour and the delicious... | 9 | APART FROM THE GOLDEN COLOUR AND THE DELICIOUS... | 9 | False |
In [ ]:
df.head()
Out[ ]:
| sentence | word_count | original_sentence | word_count_after_conversion | changed | |
|---|---|---|---|---|---|
| 0 | when you're cooking chips at home | 6 | WHEN YOU'RE COOKING CHIPS AT HOME | 6 | False |
| 1 | the traditional chip pan often stays on the shelf | 9 | THE TRADITIONAL CHIP PAN OFTEN STAYS ON THE SHELF | 9 | False |
| 2 | through what they call a knife block | 7 | THROUGH WHAT THEY CALL A KNIFE BLOCK | 7 | False |
| 3 | which involves firing a potato down a pipe | 8 | WHICH INVOLVES FIRING A POTATO DOWN A PIPE | 8 | False |
| 4 | apart from the golden colour and the delicious... | 9 | APART FROM THE GOLDEN COLOUR AND THE DELICIOUS... | 9 | False |
In [ ]:
# Load the CMU Pronunciation Dictionary
pronunciation_dict = cmudict.dict()
# Initialize the g2p converter
g2p = G2p()
def tokenize_and_lowercase_text(text):
"""Tokenize and lowercase text."""
# Replace newline characters with space
text = text.replace('\n', ' ')
# Expand contractions
text = contractions.fix(text)
# Handle decades
text = re.sub(r'(\d+)(s)', r'\1 \2', text)
# Tokenize text
tokens = nltk.word_tokenize(text)
# Lowercase tokens
tokens = [token.lower() for token in tokens]
return tokens
def words_to_phonemes(words):
phonemes = []
for word in words:
if word in ['.', ',', '?', '!', ':', ';']:
phonemes.append('<space>')
else:
if word in pronunciation_dict:
phonemes.extend(pronunciation_dict[word][0])
phonemes.append('<space>')
elif word == "'":
pass
else:
phonemes.extend(g2p(word))
phonemes.append('<space>')
return phonemes
def process_sentence(sentence):
try:
# Tokenize and lowercase text
tokenized_sentence = tokenize_and_lowercase_text(sentence)
# Convert words to phonemes
phonemes = words_to_phonemes(tokenized_sentence)
phonemes = ['<sos>'] + phonemes[:-1] + ['<eos>']
return phonemes
except Exception as e:
print(f"Error processing sentence: {sentence}")
print(e)
return None
def expand_contractions(text):
"""Expand contractions in a text."""
return contractions.fix(text)
# Expand contractions in the sentence column
df['sentence'] = df['sentence'].apply(expand_contractions)
# Then apply the tokenization and phoneme conversion processes as before
with Pool() as pool:
df['phonemes'] = pool.map(process_sentence, df['sentence'])
print(df.head())
# Inspect the data
# Check the sentences where the <space> token is not present or is present less frequently than expected
df['word_count'] = df['sentence'].apply(lambda x: len(x.split()))
df['num_spaces'] = df['phonemes'].apply(lambda x: x.count('<space>'))
unusual_sentences = df[df['num_spaces'] < df['word_count'] - 1]
print(unusual_sentences)
sentence word_count \
0 when you are cooking chips at home 6
1 the traditional chip pan often stays on the shelf 9
2 through what they call a knife block 7
3 which involves firing a potato down a pipe 8
4 apart from the golden colour and the delicious... 9
original_sentence \
0 WHEN YOU'RE COOKING CHIPS AT HOME
1 THE TRADITIONAL CHIP PAN OFTEN STAYS ON THE SHELF
2 THROUGH WHAT THEY CALL A KNIFE BLOCK
3 WHICH INVOLVES FIRING A POTATO DOWN A PIPE
4 APART FROM THE GOLDEN COLOUR AND THE DELICIOUS...
word_count_after_conversion changed \
0 6 False
1 9 False
2 7 False
3 8 False
4 9 False
phonemes
0 [<sos>, W, EH1, N, <space>, Y, UW1, <space>, A...
1 [<sos>, DH, AH0, <space>, T, R, AH0, D, IH1, S...
2 [<sos>, TH, R, UW1, <space>, W, AH1, T, <space...
3 [<sos>, W, IH1, CH, <space>, IH0, N, V, AA1, L...
4 [<sos>, AH0, P, AA1, R, T, <space>, F, R, AH1,...
Empty DataFrame
Columns: [sentence, word_count, original_sentence, word_count_after_conversion, changed, phonemes, num_spaces]
Index: []
In [ ]:
# Sample 10 random sentences from the dataset
sample_sentences = df['sentence'].sample(10)
token_counts = [len(tokenize_and_lowercase_text(sentence)) for sentence in sample_sentences]
sentence_counts = [len(sentence.split()) for sentence in sample_sentences]
# Bar Chart
index = range(len(sample_sentences))
bar_width = 0.35
fig, ax = plt.subplots(figsize=(12, 6))
bar1 = ax.bar(index, sentence_counts, bar_width, label='Original Word Count', color='#3498DB', edgecolor='black')
bar2 = ax.bar([i + bar_width for i in index], token_counts, bar_width, label='Tokenized Word Count', color='#E74C3C', edgecolor='black')
ax.set_xlabel('Sentences')
ax.set_ylabel('Word Count')
ax.set_title('Comparison of Word Counts Before and After Tokenization')
ax.set_xticks([i + bar_width for i in index])
ax.set_xticklabels(['Sentence ' + str(i+1) for i in index], rotation=45)
ax.legend()
plt.tight_layout()
plt.show()
# Annotated Text Display
for index, sentence in enumerate(sample_sentences[:2]):
tokens = tokenize_and_lowercase_text(sentence)
print(f"Sentences {index+1}:")
print(f"Original: {sentence}")
print(f"Tokenized: {tokens}")
print("-"*100)
Sentences 1: Original: that is always fascinated me Tokenized: ['that', 'is', 'always', 'fascinated', 'me'] ---------------------------------------------------------------------------------------------------- Sentences 2: Original: which means the light comes from hot Tokenized: ['which', 'means', 'the', 'light', 'comes', 'from', 'hot'] ----------------------------------------------------------------------------------------------------
In [ ]:
# Convert the list of phonemes in the 'phonemes' column to a space-separated string
df['phonemes_str'] = df['phonemes'].str.join(' ')
# Create a function to display the dataframe without truncation
def display_full_dataframe(dataframe):
with pd.option_context('display.max_rows', None, 'display.max_columns', None, 'display.max_colwidth', None):
display(dataframe)
# Use the function to display the first 5 rows
display_full_dataframe(df[["phonemes_str"]].head())
| phonemes_str | |
|---|---|
| 0 | <sos> W EH1 N <space> Y UW1 <space> AA1 R <space> K UH1 K IH0 NG <space> CH IH1 P S <space> AE1 T <space> HH OW1 M <eos> |
| 1 | <sos> DH AH0 <space> T R AH0 D IH1 SH AH0 N AH0 L <space> CH IH1 P <space> P AE1 N <space> AO1 F AH0 N <space> S T EY1 Z <space> AA1 N <space> DH AH0 <space> SH EH1 L F <eos> |
| 2 | <sos> TH R UW1 <space> W AH1 T <space> DH EY1 <space> K AO1 L <space> AH0 <space> N AY1 F <space> B L AA1 K <eos> |
| 3 | <sos> W IH1 CH <space> IH0 N V AA1 L V Z <space> F AY1 R IH0 NG <space> AH0 <space> P AH0 T EY1 T OW2 <space> D AW1 N <space> AH0 <space> P AY1 P <eos> |
| 4 | <sos> AH0 P AA1 R T <space> F R AH1 M <space> DH AH0 <space> G OW1 L D AH0 N <space> K AH1 L AW0 R <space> AH0 N D <space> DH AH0 <space> D IH0 L IH1 SH AH0 S <space> F L AE1 V ER0 <eos> |
In [ ]:
df.head()
Out[ ]:
| sentence | word_count | original_sentence | word_count_after_conversion | changed | phonemes | num_spaces | phonemes_str | |
|---|---|---|---|---|---|---|---|---|
| 0 | when you are cooking chips at home | 7 | WHEN YOU'RE COOKING CHIPS AT HOME | 6 | False | [<sos>, W, EH1, N, <space>, Y, UW1, <space>, A... | 6 | <sos> W EH1 N <space> Y UW1 <space> AA1 R <spa... |
| 1 | the traditional chip pan often stays on the shelf | 9 | THE TRADITIONAL CHIP PAN OFTEN STAYS ON THE SHELF | 9 | False | [<sos>, DH, AH0, <space>, T, R, AH0, D, IH1, S... | 8 | <sos> DH AH0 <space> T R AH0 D IH1 SH AH0 N AH... |
| 2 | through what they call a knife block | 7 | THROUGH WHAT THEY CALL A KNIFE BLOCK | 7 | False | [<sos>, TH, R, UW1, <space>, W, AH1, T, <space... | 6 | <sos> TH R UW1 <space> W AH1 T <space> DH EY1 ... |
| 3 | which involves firing a potato down a pipe | 8 | WHICH INVOLVES FIRING A POTATO DOWN A PIPE | 8 | False | [<sos>, W, IH1, CH, <space>, IH0, N, V, AA1, L... | 7 | <sos> W IH1 CH <space> IH0 N V AA1 L V Z <spac... |
| 4 | apart from the golden colour and the delicious... | 9 | APART FROM THE GOLDEN COLOUR AND THE DELICIOUS... | 9 | False | [<sos>, AH0, P, AA1, R, T, <space>, F, R, AH1,... | 8 | <sos> AH0 P AA1 R T <space> F R AH1 M <space> ... |
In [ ]:
# Remove sentences that contain ' or space in the phonemes
df = df[~df['phonemes'].apply(lambda x: "'" in x or ' ' in x)]
In [ ]:
# Create a dictionary for removing stress markers
remove_stress_dict = {str(i): '' for i in range(10)}
def remove_stress(phonemes):
"""Remove stress markers from a list of phonemes."""
return [re.sub(r'\d', '', phoneme) for phoneme in phonemes]
def add_special_tokens(sentence):
"""Add special tokens to a sentence."""
return '<sos> ' + sentence.replace(' ', ' <space> ') + ' <eos>'
# Apply the function to the sentence column
df['sentence_with_tokens'] = df['sentence'].apply(add_special_tokens)
print(df[['sentence', 'sentence_with_tokens', 'phonemes']].sample(10))
# Apply the processing function
df['phonemes'] = df['phonemes'].apply(remove_stress)
# Sample Inspection
print(df[['sentence', 'phonemes']].sample(10))
# Distribution Analysis
df['phoneme_count'] = df['phonemes'].str.len()
print(df['phoneme_count'].describe())
# Special Tokens Check
wrong_start = df[df['phonemes'].str[0] != "<sos>"]
wrong_end = df[df['phonemes'].str[-1] != "<eos>"]
print(f"Number of sequences with wrong start: {len(wrong_start)}")
print(f"Number of sequences with wrong end: {len(wrong_end)}")
# Check for None values
none_sentences = df[df['phonemes'].apply(lambda x: None in x)]
print(f"Number of sentences with None values: {len(none_sentences)}")
# Frequency Analysis
all_phonemes = list(chain.from_iterable(df['phonemes']))
phoneme_freq = Counter(all_phonemes)
print("Most common phonemes:", phoneme_freq.most_common(10))
print("Least common phonemes:", phoneme_freq.most_common()[:-11:-1])
# Check if there are any missing phonemes
missing_phonemes = df[df['phonemes'].apply(lambda x: None in x)]
print(f"Number of sentences with missing phonemes: {len(missing_phonemes)}")
space_sentences = df[df['phonemes'].apply(lambda x: ' ' in x)]
print(space_sentences[['sentence', 'phonemes']])
<ipython-input-50-a357df83680a>:13: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy df['sentence_with_tokens'] = df['sentence'].apply(add_special_tokens)
sentence \
12522 tried to break it out of my hand
12508 do not you call a doctor
16858 we have got what we wanted
18655 two of our best teams will be competing agains...
1100 if you had clarissa beside you on the barricades
43334 you brought it in
40657 the dead do not come back
11112 by the famous dambusters
25708 if you clear your plate of sandwiches they kee...
17329 how are you today
sentence_with_tokens \
12522 <sos> tried <space> to <space> break <space> i...
12508 <sos> do <space> not <space> you <space> call ...
16858 <sos> we <space> have <space> got <space> what...
18655 <sos> two <space> of <space> our <space> best ...
1100 <sos> if <space> you <space> had <space> clari...
43334 <sos> you <space> brought <space> it <space> i...
40657 <sos> the <space> dead <space> do <space> not ...
11112 <sos> by <space> the <space> famous <space> da...
25708 <sos> if <space> you <space> clear <space> you...
17329 <sos> how <space> are <space> you <space> toda...
phonemes
12522 [<sos>, T, R, AY1, D, <space>, T, UW1, <space>...
12508 [<sos>, D, UW1, <space>, N, AA1, T, <space>, Y...
16858 [<sos>, W, IY1, <space>, HH, AE1, V, <space>, ...
18655 [<sos>, T, UW1, <space>, AH1, V, <space>, AW1,...
1100 [<sos>, IH1, F, <space>, Y, UW1, <space>, HH, ...
43334 [<sos>, Y, UW1, <space>, B, R, AO1, T, <space>...
40657 [<sos>, DH, AH0, <space>, D, EH1, D, <space>, ...
11112 [<sos>, B, AY1, <space>, DH, AH0, <space>, F, ...
25708 [<sos>, IH1, F, <space>, Y, UW1, <space>, K, L...
17329 [<sos>, HH, AW1, <space>, AA1, R, <space>, Y, ...
sentence \
21948 when you can actually hold the fabric that the...
4958 there is something about them which makes us f...
18610 bills of sale and receipts
43467 look out for the qr codes
30773 most influential figures in british comedy
7102 i cannot wait to see it and you can find out h...
14877 we are pinning all our hopes on the man with t...
5095 there is lots of smaller
25014 why do not you do the power test
38100 on that assumption
phonemes
21948 [<sos>, W, EH, N, <space>, Y, UW, <space>, K, ...
4958 [<sos>, DH, EH, R, <space>, IH, Z, <space>, S,...
18610 [<sos>, B, IH, L, Z, <space>, AH, V, <space>, ...
43467 [<sos>, L, UH, K, <space>, AW, T, <space>, F, ...
30773 [<sos>, M, OW, S, T, <space>, IH, N, F, L, UW,...
7102 [<sos>, AY, <space>, K, AE, N, <space>, N, AA,...
14877 [<sos>, W, IY, <space>, AA, R, <space>, P, IH,...
5095 [<sos>, DH, EH, R, <space>, IH, Z, <space>, L,...
25014 [<sos>, W, AY, <space>, D, UW, <space>, N, AA,...
38100 [<sos>, AA, N, <space>, DH, AE, T, <space>, AH...
count 45814.000000
mean 34.139040
std 17.523979
min 11.000000
25% 21.000000
50% 29.000000
75% 42.000000
max 141.000000
Name: phoneme_count, dtype: float64
Number of sequences with wrong start: 0
Number of sequences with wrong end: 0
Number of sentences with None values: 0
Most common phonemes: [('<space>', 299529), ('AH', 111029), ('T', 91599), ('N', 77726), ('IH', 75183), ('R', 52083), ('S', 50329), ('D', 47510), ('<sos>', 45814), ('<eos>', 45814)]
Least common phonemes: [('ZH', 444), ('OY', 1151), ('UH', 5864), ('JH', 6134), ('CH', 6196), ('TH', 6864), ('SH', 7392), ('AW', 8615), ('Y', 11279), ('G', 11628)]
Number of sentences with missing phonemes: 0
Empty DataFrame
Columns: [sentence, phonemes]
Index: []
In [ ]:
# Add tokens to sentences for comparison
def add_tokens_to_sentence(sentence):
return '<sos> ' + sentence.replace(' ', ' <space> ') + ' <eos>'
df['tokenized_sentence'] = df['sentence'].apply(add_tokens_to_sentence)
# Convert the list of phonemes to a space-separated string for display
df['phonemes_str'] = df['phonemes'].apply(lambda x: ' '.join(x))
# Display the tokenized sentences and their corresponding phonemes
sample_comparison = df[['tokenized_sentence', 'phonemes_str']].sample(5)
with pd.option_context('display.max_rows', None, 'display.max_colwidth', None):
display(sample_comparison)
| tokenized_sentence | phonemes_str | |
|---|---|---|
| 17720 | <sos> but <space> what <space> it <space> does <space> need <space> is <space> a <space> set <space> of <space> eyes <eos> | <sos> B AH T <space> W AH T <space> IH T <space> D AH Z <space> N IY D <space> IH Z <space> AH <space> S EH T <space> AH V <space> AY Z <eos> |
| 26979 | <sos> taken <space> very <space> seriously <eos> | <sos> T EY K AH N <space> V EH R IY <space> S IH R IY AH S L IY <eos> |
| 39564 | <sos> let <space> us <space> find <space> out <space> about <space> one <space> of <space> the <space> most <space> ancient <space> plants <space> on <space> the <space> planet <eos> | <sos> L EH T <space> AH S <space> F AY N D <space> AW T <space> AH B AW T <space> W AH N <space> AH V <space> DH AH <space> M OW S T <space> EY N CH AH N T <space> P L AE N T S <space> AA N <space> DH AH <space> P L AE N AH T <eos> |
| 32297 | <sos> he <space> might <space> also <space> have <space> been <space> quietly <space> beheaded <eos> | <sos> HH IY <space> M AY T <space> AO L S OW <space> HH AE V <space> B IH N <space> K W AY AH T L IY <space> B IH HH EH D IH D <eos> |
| 27232 | <sos> within <space> twenty <space> four <space> hours <eos> | <sos> W IH DH IH N <space> T W EH N T IY <space> F AO R <space> AW ER Z <eos> |
In [ ]:
def check_consecutive_special_tokens(sentence_sequences, phoneme_sequences):
special_tokens = ['<eos>', '<sos>', '<space>']
for seq in sentence_sequences:
for token in special_tokens:
if f"{token} {token}" in seq:
print(f"Consecutive {token} found in sentence: {seq}")
for seq in phoneme_sequences:
for token in special_tokens:
if f"{token} {token}" in ' '.join(seq):
print(f"Consecutive {token} found in phoneme: {' '.join(seq)}")
# Example usage:
check_consecutive_special_tokens(df['sentence_with_tokens'], df['phonemes'])
Consecutive <space> found in phoneme: <sos> DH AH <space> AA R CH ER Z <space> L AO S T <space> IH T S <space> EH JH AH K EY SH AH N AH L <space> P ER P AH S <space> IH N <space> W AH N <space> TH AW Z AH N D <space> <space> N AY N <space> HH AH N D R AH D <space> AH N D <space> S EH V AH N T IY <space> T UW <eos> Consecutive <space> found in phoneme: <sos> W IY <space> AA R <space> K AA N S AH N T R EY T IH NG <space> AA N <space> W AH N <space> TH AW Z AH N D <space> <space> N AY N <space> HH AH N D R AH D <space> AH N D <space> S IH K S T IY <space> F AO R <space> AH N D <space> W AH N <space> TH AW Z AH N D <space> <space> N AY N <space> HH AH N D R AH D <space> AH N D <space> S IH K S T IY <space> F AY V <space> AH N D <space> HH IY R IH NG <space> S AH M <space> M AO R <space> P ER S IH N IH L <space> S T AO R IY Z <space> AH V <eos> Consecutive <space> found in phoneme: <sos> L EH T <space> AH S <space> T EY K <space> AH <space> L UH K <space> AE T <space> S AH M <space> AH V <space> DH AH <space> AH DH ER <space> N UW Z <space> HH EH D L AY N Z <space> IH N <space> W AH N <space> TH AW Z AH N D <space> <space> N AY N <space> HH AH N D R AH D <space> AH N D <space> S IH K S T IY <space> S IH K S <space> AH N D <space> S IH K S T IY <space> S EH V AH N <space> AH L AO NG <space> W IH DH <space> S AH M <space> AH V <space> DH AH <space> M Y UW Z IH K <eos> Consecutive <space> found in phoneme: <sos> F OW K AH S IH NG <space> AA N <space> DH AH <space> IH V EH N T S <space> IH N <space> W AH N <space> TH AW Z AH N D <space> <space> N AY N <space> HH AH N D R AH D <space> AH N D <space> S IH K S T IY <space> EY T <space> AH N D <space> W AH N <space> TH AW Z AH N D <space> <space> N AY N <space> HH AH N D R AH D <space> AH N D <space> S IH K S T IY <space> N AY N <eos> Consecutive <space> found in phoneme: <sos> HH UW <space> D AY D <space> IH N <space> W AH N <space> TH AW Z AH N D <space> <space> N AY N <space> HH AH N D R AH D <space> AH N D <space> F AO R T IY <eos> Consecutive <space> found in phoneme: <sos> B IY IH NG <space> R AH S IY V D <space> IH N <space> W AH N <space> TH AW Z AH N D <space> <space> S EH V AH N <space> HH AH N D R AH D <space> AH N D <space> TH ER D IY <space> F AO R <space> AH N D <space> M AE N Y AH F AE K CH ER D <space> DH AE T <space> Y IH R <eos> Consecutive <space> found in phoneme: <sos> IH T <space> IH Z <space> B IH N <space> P R AH D UW S IH NG <space> L OW K AH L <space> EY L <space> S IH N S <space> AE T <space> L IY S T <space> W AH N <space> TH AW Z AH N D <space> <space> S IH K S <space> HH AH N D R AH D <space> AH N D <space> F AO R T IY <space> T UW <eos> Consecutive <space> found in phoneme: <sos> B AH T <space> IH N <space> F AE K T <space> IH N <space> W AH N <space> TH AW Z AH N D <space> <space> F AY V <space> HH AH N D R AH D <space> AH N D <space> T W EH N T IY <space> S EH V AH N <eos> Consecutive <space> found in phoneme: <sos> T UW <space> DH AH <space> T AY M <space> AH V <space> HH IH Z <space> D EH TH <space> IH N <space> W AH N <space> TH AW Z AH N D <space> <space> N AY N <space> HH AH N D R AH D <eos> Consecutive <space> found in phoneme: <sos> AY <space> HH AE V <space> B IH N <space> AH <space> V EH JH AH T EH R IY AH N <space> S IH N S <space> W AH N <space> TH AW Z AH N D <space> <space> N AY N <space> HH AH N D R AH D <space> AH N D <space> S EH V AH N T IY <space> N AY N <eos> Consecutive <space> found in phoneme: <sos> AH <space> V AH N IY SH AH N <space> HH UW <space> K EY M <space> HH IY R <space> B IH T W IY N <space> AH B AW T <space> W AH N <space> TH AW Z AH N D <space> <space> S EH V AH N <space> HH AH N D R AH D <space> AH N D <space> N AY N <space> AH N D <space> W AH N <space> TH AW Z AH N D <space> <space> S EH V AH N <space> HH AH N D R AH D <space> AH N D <space> T EH N <eos> Consecutive <space> found in phoneme: <sos> DH AH <space> AA R CH ER Z <space> F R AH M <space> W AH N <space> TH AW Z AH N D <space> <space> N AY N <space> HH AH N D R AH D <space> AH N D <space> EY T IY <space> T UW <space> DH AH <space> P R EH Z AH N T <space> D EY <eos> Consecutive <space> found in phoneme: <sos> AW ER <space> F ER S T <space> W AA Z <space> CH OW Z AH N <space> F AO R <space> HH ER <space> M AE JH AH S T IY <space> EH S <space> W EH D IH NG <space> T UW <space> P R IH N S <space> F IH L AH P <space> B AE K <space> IH N <space> W AH N <space> TH AW Z AH N D <space> <space> N AY N <space> HH AH N D R AH D <space> AH N D <space> F AO R T IY <space> S EH V AH N <eos> Consecutive <space> found in phoneme: <sos> AH N D <space> B AY <space> DH AH <space> T AY M <space> AH V <space> HH IH Z <space> D EH TH <space> IH N <space> W AH N <space> TH AW Z AH N D <space> <space> EY T <space> HH AH N D R AH D <space> AH N D <space> TH ER D IY <space> F AY V <eos> Consecutive <space> found in phoneme: <sos> IH T <space> W AA Z <space> M EY D <space> IH N <space> AH B AW T <space> W AH N <space> TH AW Z AH N D <space> <space> N AY N <space> HH AH N D R AH D <space> AH N D <space> TH ER D IY <space> F AY V <eos> Consecutive <space> found in phoneme: <sos> W EH N <space> DH AH <space> R EY L W EY <space> S T EY SH AH N <space> W AA Z <space> IH N AO G ER EY T IH D <space> HH IY R <space> IH N <space> AA G AH S T <space> W AH N <space> TH AW Z AH N D <space> <space> EY T <space> HH AH N D R AH D <space> AH N D <space> F IH F T IY <space> W AH N <eos> Consecutive <space> found in phoneme: <sos> P R IY S IH ZH AH N <space> S T R AY K <space> W AA Z <space> W AH T <space> DH AH <space> D AE M B AH S T ER Z <space> W ER <space> AH B AW T <space> IH N <space> W AH N <space> TH AW Z AH N D <space> <space> N AY N <space> HH AH N D R AH D <space> AH N D <space> F AO R T IY <space> TH R IY <space> AH N D <space> DH AE T <space> IH Z <space> V EH R IY <space> M AH CH <space> DH AH <space> S EY M <space> T AH D EY <eos> Consecutive <space> found in phoneme: <sos> W EH N <space> HH EH N R IY <space> N EH L T <space> T UW <space> M EY K <space> HH IH Z <space> W EH D IH NG <space> V AW Z <space> T UW <space> AE N <space> B OW L IH N <space> IH N <space> JH AE N Y UW EH R IY <space> W AH N <space> TH AW Z AH N D <space> <space> F AY V <space> HH AH N D R AH D <space> AH N D <space> TH ER D IY <space> TH R IY <eos> Consecutive <space> found in phoneme: <sos> IH N <space> AA K T OW B ER <space> W AH N <space> TH AW Z AH N D <space> <space> F AY V <space> HH AH N D R AH D <space> AH N D <space> TH ER D IY <space> S EH V AH N <eos> Consecutive <space> found in phoneme: <sos> B AH T <space> AY <space> TH IH NG K <space> IH T <space> IH Z <space> AH <space> W AH N D ER F AH L <space> IY V OW K EY SH AH N <space> AH V <space> DH AH <space> HH AA R D <space> T AY M Z <space> IH N <space> W AH N <space> TH AW Z AH N D <space> <space> N AY N <space> HH AH N D R AH D <space> AH N D <space> F AO R T IY <eos> Consecutive <space> found in phoneme: <sos> DH AE T <space> HH AE D <space> AH <space> AH K <space> T AA P <space> T EH N <space> S IH NG G AH L <space> IH N <space> W AH N <space> TH AW Z AH N D <space> <space> N AY N <space> HH AH N D R AH D <space> AH N D <space> N AY N T IY <space> F AY V <space> W IH DH <space> T ER N <space> AA N <eos> Consecutive <space> found in phoneme: <sos> F ER S T <space> P ER F AO R M D <space> IH N <space> W AH N <space> TH AW Z AH N D <space> <space> EY T <space> HH AH N D R AH D <space> AH N D <space> F AO R T IY <space> S IH K S <eos> Consecutive <space> found in phoneme: <sos> HH IY R <space> IH Z <space> AH <space> R IH L IY <space> G UH D <space> W EY <space> T UW <space> D IY L <space> W IH DH <space> DH EH M <space> IH N <space> DH AH <space> W AH N <space> TH AW Z AH N D <space> <space> N AY N <space> HH AH N D R AH D <space> AH N D <space> EY T IY <space> F AO R <space> G AA R D AH N <eos> Consecutive <space> found in phoneme: <sos> W IH CH <space> AY <space> AE M <space> G EH S IH NG <space> IH Z <space> S AH M TH IH NG <space> B IH T W IY N <space> W AH N <space> TH AW Z AH N D <space> <space> EY T <space> HH AH N D R AH D <space> AH N D <space> T W EH N T IY <eos> Consecutive <space> found in phoneme: <sos> HH IY <space> D AY D <space> IH N <space> W AH N <space> TH AW Z AH N D <space> <space> S EH V AH N <space> HH AH N D R AH D <space> AH N D <space> F IH F T IY <space> F AO R <eos> Consecutive <space> found in phoneme: <sos> DH EY <space> M EH R IY D <space> IH N <space> W AH N <space> TH AW Z AH N D <space> <space> EY T <space> HH AH N D R AH D <space> AH N D <space> N AY N T IY <space> TH R IY <eos> Consecutive <space> found in phoneme: <sos> AH N D <space> HH IY <space> R IH T AY R D <space> IH N <space> AH B AW T <space> W AH N <space> TH AW Z AH N D <space> <space> N AY N <space> HH AH N D R AH D <space> AH N D <space> N AY N T IY <space> W AH N <eos> Consecutive <space> found in phoneme: <sos> M AH Z UH R IY <space> IH N <space> W AH N <space> TH AW Z AH N D <space> <space> N AY N <space> HH AH N D R AH D <space> AH N D <space> F AO R T IY <space> S IH K S <eos> Consecutive <space> found in phoneme: <sos> G AY <space> G AA T <space> HH IH Z <space> N EY M <space> W EH N <space> HH IY <space> ER AY V D <space> AE T <space> DH AH <space> Z UW <space> AA N <space> G AY <space> F AO K S <space> D EY <space> IH N <space> W AH N <space> TH AW Z AH N D <space> <space> N AY N <space> HH AH N D R AH D <space> AH N D <space> F AO R T IY <space> S EH V AH N <eos> Consecutive <space> found in phoneme: <sos> K IH L ER T AH N <space> HH AW S <space> W AA Z <space> B IH L T <space> IH N <space> W AH N <space> TH AW Z AH N D <space> <space> S EH V AH N <space> HH AH N D R AH D <space> AH N D <space> S EH V AH N T IY <space> N AY N <space> F AO R <space> W AH N <space> AH V <space> D EH V AH N <space> EH S <space> OW L D AH S T <space> F AE M AH L IY Z <eos> Consecutive <space> found in phoneme: <sos> K AH M IH NG <space> AH P <space> T UW <space> W AH N <space> TH AW Z AH N D <space> <space> EY T <space> HH AH N D R AH D <space> AH N D <space> AH <space> B IH T <space> B IH AA N D <eos> Consecutive <space> found in phoneme: <sos> IH N <space> ER AW N D <space> AH B AW T <space> DH AH <space> Y IH R <space> W AH N <space> TH AW Z AH N D <space> <space> N AY N <space> HH AH N D R AH D <space> AH N D <space> T EH N <eos> Consecutive <space> found in phoneme: <sos> AH N D <space> HH IY <space> W AA Z <space> B AO R N <space> IH N <space> W AH N <space> TH AW Z AH N D <space> <space> EY T <space> HH AH N D R AH D <space> AH N D <space> EY T IY <space> EY T <eos> Consecutive <space> found in phoneme: <sos> IH N <space> DH AH <space> W IH N T ER <space> AH V <space> W AH N <space> TH AW Z AH N D <space> <space> N AY N <space> HH AH N D R AH D <space> AH N D <space> F IH F T IY <space> EY T <eos> Consecutive <space> found in phoneme: <sos> W IH CH <space> IH T <space> D IH D <space> AH N T IH L <space> W AH N <space> TH AW Z AH N D <space> <space> EY T <space> HH AH N D R AH D <space> AH N D <space> N AY N T IY <space> N AY N <eos> Consecutive <space> found in phoneme: <sos> DH AH <space> L UW S AH T EY N IY AH <space> W AH N <space> DH AH <space> B L UW <space> R AY B AH N D <space> IH N <space> W AH N <space> TH AW Z AH N D <space> <space> N AY N <space> HH AH N D R AH D <space> AH N D <space> S EH V AH N <eos> Consecutive <space> found in phoneme: <sos> P AH B L IH SH T <space> IH N <space> W AH N <space> TH AW Z AH N D <space> <space> N AY N <space> HH AH N D R AH D <space> AH N D <space> W AH N <space> AH N D <space> K AO L D <space> S IH M P L IY <space> P AA V ER T IY <eos> Consecutive <space> found in phoneme: <sos> AY <space> HH AE V <space> B IH N <space> AE N <space> AE K T R AH S <space> S IH N S <space> W AH N <space> TH AW Z AH N D <space> <space> N AY N <space> HH AH N D R AH D <space> AH N D <space> EY T IY <space> F AY V <eos> Consecutive <space> found in phoneme: <sos> IH T <space> W AA Z <space> K AE S T <space> IH N <space> L AO B ER OW <space> IH N <space> W AH N <space> TH AW Z AH N D <space> <space> N AY N <space> HH AH N D R AH D <space> AH N D <space> T W EH N T IY <space> F AO R <eos> Consecutive <space> found in phoneme: <sos> DH EH R <space> W AA Z <space> L EH S <space> F AO R AH S T <space> IH N <space> W AH N <space> TH AW Z AH N D <space> <space> N AY N <space> HH AH N D R AH D <space> DH AE N <space> DH EH R <space> HH AE D <space> B IH N <space> F AO R <space> T EH N <eos> Consecutive <space> found in phoneme: <sos> IH T <space> HH AE Z <space> B IH N <space> L EY D <space> AH P <space> S IH N S <space> IH T <space> B R OW K <space> D AW N <space> IH N <space> W AH N <space> TH AW Z AH N D <space> <space> N AY N <space> HH AH N D R AH D <space> AH N D <space> TH ER D IY <space> F AO R <eos> Consecutive <space> found in phoneme: <sos> AY <space> W AA Z <space> L AH K IY <space> IH N AH F <space> T UW <space> S IH T <space> W IH DH <space> AH <space> G AY <space> HH UW <space> W AH N <space> DH AH <space> F ER S T <space> B R IH T IH SH <space> R AE L IY <space> CH AE M P IY AH N SH IH P <space> IH N <space> W AH N <space> TH AW Z AH N D <space> <space> N AY N <space> HH AH N D R AH D <space> AH N D <space> F IH F T IY <space> EY T <eos> Consecutive <space> found in phoneme: <sos> IH T <space> M EY D <space> AH <space> R IY L <space> IH M P R EH SH AH N <space> AA N <space> K AE P T AH N <space> K UH K <space> W EH N <space> HH IY <space> K EY M <space> HH IY R <space> IH N <space> W AH N <space> TH AW Z AH N D <space> <space> S EH V AH N <space> HH AH N D R AH D <space> AH N D <space> S EH V AH N T IY <eos> Consecutive <space> found in phoneme: <sos> S OW <space> F R AH M <space> W AH N <space> TH AW Z AH N D <space> <space> N AY N <space> HH AH N D R AH D <space> AH N D <space> F AO R T IY <space> T UW <space> AA N W ER D Z <eos> Consecutive <space> found in phoneme: <sos> DH AE T <space> IH Z <space> W EH R <space> AY <space> S T AA R T AH D <space> M AY <space> B IY B IY S IY <space> K ER IH R <space> B AE K <space> IH N <space> W AH N <space> TH AW Z AH N D <space> <space> N AY N <space> HH AH N D R AH D <space> AH N D <space> N AY N T IY <space> TH R IY <eos> Consecutive <space> found in phoneme: <sos> IH T <space> F ER S T <space> AH P IH R D <space> AA N <space> B IY B IY S IY <space> T UW <space> IH N <space> W AH N <space> TH AW Z AH N D <space> <space> N AY N <space> HH AH N D R AH D <space> AH N D <space> S EH V AH N T IY <space> EY T <eos> Consecutive <space> found in phoneme: <sos> R AY T <space> TH R UW <space> T UW <space> AH B AW T <space> W AH N <space> TH AW Z AH N D <space> <space> EY T <space> HH AH N D R AH D <space> AH N D <space> F IH F T IY <eos> Consecutive <space> found in phoneme: <sos> DH AH <space> ER IH JH AH N AH L <space> W AH N <space> W AA Z <space> N AA K T <space> D AW N <space> T UW <space> B IY <space> R IY P L EY S T <space> B AY <space> DH IH S <space> W AH N <space> IH N <space> W AH N <space> TH AW Z AH N D <space> <space> EY T <space> HH AH N D R AH D <space> AH N D <space> EY T IY <space> S EH V AH N <space> AH N D <space> W AH T <space> IH Z <space> IY V IH N <space> M AO R <eos> Consecutive <space> found in phoneme: <sos> B IH K AO Z <space> W EH N <space> SH IY <space> D AY D <space> IH N <space> W AH N <space> TH AW Z AH N D <space> <space> EY T <space> HH AH N D R AH D <space> AH N D <space> F AO R T IY N <eos> Consecutive <space> found in phoneme: <sos> HH UW <space> W AA Z <space> B AO R N <space> IH N <space> W AH N <space> TH AW Z AH N D <space> <space> N AY N <space> HH AH N D R AH D <space> AH N D <space> F AO R T IY <space> EY T <eos> Consecutive <space> found in phoneme: <sos> AE N <space> OW L D <space> S IH N AH G AO G <space> D EY T IH NG <space> F R AH M <space> W AH N <space> TH AW Z AH N D <space> <space> TH R IY <space> HH AH N D R AH D <space> AH N D <space> TH ER D IY <space> EY T <eos> Consecutive <space> found in phoneme: <sos> DH AH <space> R IY L <space> L AE S T <space> IH N V EY ZH AH N <space> AE K CH AH W AH L IY <space> K EY M <space> HH IY R <space> IH N <space> W AH N <space> TH AW Z AH N D <space> <space> F AY V <space> HH AH N D R AH D <space> AH N D <space> F AO R T IY <space> F AY V <eos> Consecutive <space> found in phoneme: <sos> P AH L IH T AH K AH L <space> P R EH SH ER <space> L EH D <space> T UW <space> DH AH <space> P AE S AH JH <space> AH V <space> DH AH <space> W AH N <space> TH AW Z AH N D <space> <space> N AY N <space> HH AH N D R AH D <space> AH N D <space> F AY V <space> EY L IY AH N Z <space> AE K T <eos> Consecutive <space> found in phoneme: <sos> AH N D <space> S EH T <space> AH G EH N S T <space> DH AH <space> B AE K D R AA P <space> AH V <space> DH AH <space> W AH N <space> TH AW Z AH N D <space> <space> N AY N <space> HH AH N D R AH D <space> AH N D <space> EY T IY <space> F AO R <space> M AY N ER Z <space> S T R AY K <eos> Consecutive <space> found in phoneme: <sos> IH N D IY D <space> B IH T W IY N <space> W AH N <space> TH AW Z AH N D <space> <space> N AY N <space> HH AH N D R AH D <space> AH N D <space> F AO R T IY <space> F AY V <space> AH N D <space> W AH N <space> TH AW Z AH N D <space> <space> N AY N <space> HH AH N D R AH D <space> AH N D <space> S IH K S T IY <space> F AY V <space> IH T <space> IH Z <space> F EH R <space> T UW <space> S EY <space> DH AE T <space> HH IY <space> K AH N T R IH B Y UW T IH D <space> M AO R <eos> Consecutive <space> found in phoneme: <sos> B IH L D IH NG <space> W ER K <space> S T AA R T AH D <space> IH N <space> W AH N <space> TH AW Z AH N D <space> <space> S EH V AH N <space> HH AH N D R AH D <space> AH N D <space> TH ER D IY <space> N AY N <eos> Consecutive <space> found in phoneme: <sos> DH AH <space> HH AW S <space> W AA Z <space> B IH L T <space> IH N <space> W AH N <space> TH AW Z AH N D <space> <space> S IH K S <space> HH AH N D R AH D <space> AH N D <space> S IH K S <eos> Consecutive <space> found in phoneme: <sos> AH <space> P AH B <space> CH EY N <space> EH S <space> B IH N <space> K R IH T AH S AY Z D <space> F AO R <space> DH IH S <space> D AH B AH L <space> D OW N AH T <space> B ER G ER <space> W IH DH <space> W AH N <space> TH AW Z AH N D <space> <space> N AY N <space> HH AH N D R AH D <space> AH N D <space> N AY N T IY <space> S IH K S <eos> Consecutive <space> found in phoneme: <sos> AH <space> K AA P IY <space> AH V <space> AH <space> M Y UW T ER S AY K AH N G <space> M AE G AH Z IY N <space> F R AH M <space> W AH N <space> TH AW Z AH N D <space> <space> N AY N <space> HH AH N D R AH D <space> AH N D <space> S EH V AH N T IY <space> S IH K S <eos> Consecutive <space> found in phoneme: <sos> M AO R <space> R IY S AH N T L IY <space> IH N <space> W AH N <space> TH AW Z AH N D <space> <space> N AY N <space> HH AH N D R AH D <space> AH N D <space> F AO R T IY <space> N AY N <eos> Consecutive <space> found in phoneme: <sos> EH V ER <space> S IH N S <space> IH T S <space> F ER S T <space> AH P IH R AH N S <space> IH N <space> AH <space> B L AE K <space> AH N D <space> W AY T <space> S K R AE P Y AA R D <space> IH N <space> W AH N <space> TH AW Z AH N D <space> <space> N AY N <space> HH AH N D R AH D <space> AH N D <space> S IH K S T IY <space> TH R IY <eos> Consecutive <space> found in phoneme: <sos> AH N D <space> IH N <space> D IH S EH M B ER <space> W AH N <space> TH AW Z AH N D <space> <space> F AY V <space> HH AH N D R AH D <space> AH N D <space> T W EH N T IY <space> S EH V AH N <eos> Consecutive <space> found in phoneme: <sos> OW V ER <space> S IH K S <space> D EY Z <space> IH N <space> W AH N <space> TH AW Z AH N D <space> <space> N AY N <space> HH AH N D R AH D <space> AH N D <space> F IH F T IY <space> EY T <eos> Consecutive <space> found in phoneme: <sos> S AH M <space> W AH N <space> TH AW Z AH N D <space> <space> EY T <space> HH AH N D R AH D <space> Y IH R Z <space> AH G OW <eos> Consecutive <space> found in phoneme: <sos> AH N D <space> DH IH S <space> M EH G AH F OW N <space> D EY T S <space> F R AH M <space> W AH N <space> TH AW Z AH N D <space> <space> N AY N <space> HH AH N D R AH D <space> AH N D <space> S IH K S T IY <space> TH R IY <eos> Consecutive <space> found in phoneme: <sos> T UW <space> TH AW Z AH N D <space> AH N D <space> T EH N <space> W IY <space> IH N HH EH R AH T IH D <space> DH AH <space> L OW AH S T <space> L EH V AH L <space> AH V <space> B IH L D IH NG Z <space> S IH N S <space> W AH N <space> TH AW Z AH N D <space> <space> N AY N <space> HH AH N D R AH D <space> AH N D <space> T W EH N T IY <space> TH R IY <eos> Consecutive <space> found in phoneme: <sos> DH AH <space> F ER S T <space> R EH F ER AH N S <space> T UW <space> DH AH <space> B AO R AH S T OW N <space> IH Z <space> F R AH M <space> W AH N <space> TH AW Z AH N D <space> <space> S EH V AH N <space> HH AH N D R AH D <space> AH N D <space> T W EH N T IY <space> TH R IY <eos> Consecutive <space> found in phoneme: <sos> HH UW <space> W AH N <space> AH <space> B EH S T <space> AE K T R AH S <space> AO S K ER <space> IH N <space> W AH N <space> TH AW Z AH N D <space> <space> N AY N <space> HH AH N D R AH D <space> AH N D <space> EY T IY <space> EY T <space> F AO R <space> DH AH <space> F IH L M <space> M UW N S T R AH K <eos> Consecutive <space> found in phoneme: <sos> W EY <space> B AE K <space> IH N <space> W AH N <space> TH AW Z AH N D <space> <space> N AY N <space> HH AH N D R AH D <space> AH N D <space> N AY N T IY <space> W AH N <eos> Consecutive <space> found in phoneme: <sos> HH UW <space> W AA Z <space> IH M P L IH K EY T IH D <space> IH N <space> DH AH <space> G AH N P AW D ER <space> P L AA T <space> IH N <space> W AH N <space> TH AW Z AH N D <space> <space> S IH K S <space> HH AH N D R AH D <space> AH N D <space> F AY V <eos> Consecutive <space> found in phoneme: <sos> DH AH <space> S T AA R <space> AH V <space> DH AH <space> S T AA R <space> T R EH K <space> S IH R IY Z <space> AH N D <space> F IH L M Z <space> B IY M D <space> D AW N <space> T UW <space> DH AH <space> W OW G AH N <space> S T UW D IY OW <space> IH N <space> W AH N <space> TH AW Z AH N D <space> <space> N AY N <space> HH AH N D R AH D <space> AH N D <space> EY T IY <space> N AY N <eos> Consecutive <space> found in phoneme: <sos> W EH N <space> B EH T IY <space> D EY V AH S <space> K EY M <space> AA N <space> DH AH <space> SH OW <space> B AE K <space> IH N <space> W AH N <space> TH AW Z AH N D <space> <space> N AY N <space> HH AH N D R AH D <space> AH N D <space> EY T IY <space> S EH V AH N <eos> Consecutive <space> found in phoneme: <sos> IH N <space> W IH CH <space> AY <space> F L AY <space> AH <space> W AH N <space> TH AW Z AH N D <space> <space> N AY N <space> HH AH N D R AH D <space> AH N D <space> T W EH N T IY <space> S IH K S <space> S T IH R M AH N <eos> Consecutive <space> found in phoneme: <sos> IH N <space> W AH N <space> TH AW Z AH N D <space> <space> EY T <space> HH AH N D R AH D <space> AH N D <space> T W EH N T IY <space> EY T <space> DH AE T <space> DH AH <space> F ER S T <space> F R EH N CH <space> AH N Y AH N <space> S EH L ER <space> D IH S AY D IH D <space> T UW <space> T R AY <space> HH IH Z <space> L AH K <space> AH N D <space> K R AO S <space> DH AH <eos> Consecutive <space> found in phoneme: <sos> AH K AO R D IH NG <space> T UW <space> DH AH <space> W AH N <space> TH AW Z AH N D <space> <space> N AY N <space> HH AH N D R AH D <space> AH N D <space> IH L EH V AH N <space> S EH N S AH S <eos> Consecutive <space> found in phoneme: <sos> IH N <space> W AH N <space> TH AW Z AH N D <space> <space> EY T <space> HH AH N D R AH D <space> AH N D <space> T W EH N T IY <space> EY T <space> DH AE T <space> DH AH <space> F ER S T <space> F R EH N CH <space> AH N Y AH N <space> S EH L ER <space> D IH S AY D IH D <space> T UW <space> T R AY <space> HH IH Z <space> L AH K <space> AH N D <space> K R AO S <space> DH AH <eos> Consecutive <space> found in phoneme: <sos> IH T <space> W AA Z <space> AE T <space> DH IH S <space> V EH R IY <space> S P AA T <space> IH N <space> W AH N <space> TH AW Z AH N D <space> <space> S IH K S <space> HH AH N D R AH D <space> AH N D <space> F IH F T IY <space> W AH N <space> DH AE T <space> CH AA R L Z <space> IY <space> IH S K EY P T <space> K AE P CH ER <space> B AY <eos> Consecutive <space> found in phoneme: <sos> W IY <space> HH AE V <space> N AA T <space> HH AE D <space> AE N <space> AA R M IY <space> S IH N S <space> W AH N <space> TH AW Z AH N D <space> <space> N AY N <space> HH AH N D R AH D <space> AH N D <space> F AO R T IY <space> EY T <eos> Consecutive <space> found in phoneme: <sos> S OW <space> DH EY <space> JH OY N D <space> F AO R S IH Z <space> W IH DH <space> DH AH <space> AO S T R IY AH N Z <space> AH N D <space> B AY <space> W AH N <space> TH AW Z AH N D <space> <space> S EH V AH N <space> HH AH N D R AH D <space> AH N D <space> S IH K S <eos> Consecutive <space> found in phoneme: <sos> S UW N <space> AE F T ER <space> DH AH <space> N AA T S IY Z <space> K EY M <space> T UW <space> P AW ER <space> IH N <space> W AH N <space> TH AW Z AH N D <space> <space> N AY N <space> HH AH N D R AH D <space> AH N D <space> TH ER D IY <space> TH R IY <eos> Consecutive <space> found in phoneme: <sos> AH N D <space> SH IY <space> HH AE D <space> DH IH S <space> AH F EH R <space> W IH DH <space> EH D W ER D <space> B IH T W IY N <space> W AH N <space> TH AW Z AH N D <space> <space> N AY N <space> HH AH N D R AH D <space> AH N D <space> EY T IY N <space> AH N D <space> W AH N <space> TH AW Z AH N D <space> <space> N AY N <space> HH AH N D R AH D <space> AH N D <space> T W EH N T IY <space> TH R IY <eos> Consecutive <space> found in phoneme: <sos> B IH K AO Z <space> IH N <space> W AH N <space> TH AW Z AH N D <space> <space> N AY N <space> HH AH N D R AH D <space> AH N D <space> EY T IY <space> W AH N <space> W IY <space> W ER <space> R IH L IY <space> AE T <space> DH AH <space> T IH P IH NG <space> P OY N T <space> B IH T W IY N <space> DH AH <space> T ER B OW <eos> Consecutive <space> found in phoneme: <sos> AY <space> S IY <space> IH T <space> W AA Z <space> R IH T AH N <space> IH N <space> JH UW N <space> W AH N <space> TH AW Z AH N D <space> <space> N AY N <space> HH AH N D R AH D <space> AH N D <space> S IH K S T IY <space> S EH V AH N <eos> Consecutive <space> found in phoneme: <sos> IH T <space> IH Z <space> B EY S T <space> AA N <space> HH IH Z <space> S EH L F <space> P AO R T R AH T <space> AH V <space> W AH N <space> TH AW Z AH N D <space> <space> S EH V AH N <space> HH AH N D R AH D <space> AH N D <space> F AO R T IY <space> F AY V <eos> Consecutive <space> found in phoneme: <sos> DH IH S <space> S AY T <space> W AA Z <space> AE N <space> R AE F <space> EH R <space> B EY S <space> F R AH M <space> W AH N <space> TH AW Z AH N D <space> <space> N AY N <space> HH AH N D R AH D <space> AH N D <space> EY T IY N <space> AH N T IH L <space> W AH N <space> TH AW Z AH N D <space> <space> N AY N <space> HH AH N D R AH D <space> AH N D <space> S IH K S T IY <space> W AH N <space> AH N D <space> N AW <eos> Consecutive <space> found in phoneme: <sos> IH N <space> DH AH <space> S AH M ER <space> AH V <space> W AH N <space> TH AW Z AH N D <space> <space> N AY N <space> HH AH N D R AH D <space> AH N D <space> F AO R T IY <eos> Consecutive <space> found in phoneme: <sos> AE T <space> DH AH <space> B AE T AH L <space> AH V <space> K W IH B ER OW N <space> B EY <space> IH N <space> W AH N <space> TH AW Z AH N D <space> <space> S EH V AH N <space> HH AH N D R AH D <space> AH N D <space> F IH F T IY <space> N AY N <eos> Consecutive <space> found in phoneme: <sos> IH N <space> DH AH <space> S AH M ER <space> AH V <space> W AH N <space> TH AW Z AH N D <space> <space> N AY N <space> HH AH N D R AH D <space> AH N D <space> F AO R T IY <eos> Consecutive <space> found in phoneme: <sos> HH AE V IH NG <space> B IH N <space> K AH M P L IY T AH D <space> IH N <space> W AH N <space> TH AW Z AH N D <space> <space> N AY N <space> HH AH N D R AH D <space> AH N D <space> S IH K S T IY <eos> Consecutive <space> found in phoneme: <sos> W IY <space> S AH D AH N L IY <space> EH M B AA R K T <space> AA N <space> AH <space> HH EH D L AO NG <space> R AH SH <space> T UW <space> G EH T <space> R IH D <space> AH V <space> S T IY M <space> F R AH M <space> W AH N <space> TH AW Z AH N D <space> <space> N AY N <space> HH AH N D R AH D <space> AH N D <space> S IH K S T IY <space> TH R IY <space> T UW <eos> Consecutive <space> found in phoneme: <sos> R AY T <space> AH P <space> T UW <space> W AH N <space> TH AW Z AH N D <space> <space> EY T <space> HH AH N D R AH D <space> AH N D <space> T W EH N T IY <space> S IH K S <eos> Consecutive <space> found in phoneme: <sos> HH IY <space> P AE T AH N T AH D <space> DH AH <space> S AH L IH N D R IH K AH L <space> S L AY D <space> R UW L <space> IH N <space> W AH N <space> TH AW Z AH N D <space> <space> EY T <space> HH AH N D R AH D <space> AH N D <space> S EH V AH N T IY <space> EY T <eos> Consecutive <space> found in phoneme: <sos> B IH K AO Z <space> HH IY <space> K EY M <space> T UW <space> P AW ER <space> IH N <space> AH <space> M IH L AH T EH R IY <space> K UW <space> IH N <space> W AH N <space> TH AW Z AH N D <space> <space> N AY N <space> HH AH N D R AH D <space> AH N D <space> S EH V AH N T IY <space> S EH V AH N <eos> Consecutive <space> found in phoneme: <sos> DH AH <space> L AE S T <space> AA B Z ER V EY SH AH N <space> T UW <space> B IY <space> D AH N <space> HH IY R <space> W AA Z <space> M EY D <space> IH N <space> W AH N <space> TH AW Z AH N D <space> <space> N AY N <space> HH AH N D R AH D <space> AH N D <space> F IH F T IY <space> F AO R <eos> Consecutive <space> found in phoneme: <sos> IH N <space> D IH S EH M B ER <space> W AH N <space> TH AW Z AH N D <space> <space> N AY N <space> HH AH N D R AH D <space> AH N D <space> F IH F T IY <space> W AH N <eos> Consecutive <space> found in phoneme: <sos> S IH N S <space> K AA M IH K <space> R IH L IY F <space> S T AA R T AH D <space> IH N <space> W AH N <space> TH AW Z AH N D <space> <space> N AY N <space> HH AH N D R AH D <space> AH N D <space> EY T IY <space> F AY V <eos> Consecutive <space> found in phoneme: <sos> HH UW <space> W AA Z <space> B AO R N <space> IH N <space> W AH N <space> TH AW Z AH N D <space> <space> N AY N <space> HH AH N D R AH D <space> AH N D <space> IH L EH V AH N <eos> Consecutive <space> found in phoneme: <sos> AH N D <space> AE Z <space> AH <space> CH AY L D <space> IH N <space> W AH N <space> TH AW Z AH N D <space> <space> N AY N <space> HH AH N D R AH D <space> AH N D <space> S IH K S T IY <space> S EH V AH N <eos> Consecutive <space> found in phoneme: <sos> HH IY <space> W AA Z <space> W ER K IH NG <space> AW T <space> IH N <space> IY S T <space> AE F R AH K AH <space> IH N <space> W AH N <space> TH AW Z AH N D <space> <space> N AY N <space> HH AH N D R AH D <space> AH N D <space> S EH V AH N T IY <space> TH R IY <space> W EH N <space> HH IY <space> K EY M <space> AH K R AO S <space> AH <space> F AH S IH L AH S T <space> B OW N <space> DH AE T <eos> Consecutive <space> found in phoneme: <sos> IH T <space> S T AA R T AH D <space> IH N <space> W AH N <space> TH AW Z AH N D <space> <space> N AY N <space> HH AH N D R AH D <space> AH N D <space> N AY N T IY <space> F AO R <eos> Consecutive <space> found in phoneme: <sos> JH AH S T <space> AH N AH DH ER <space> T UW <space> TH AW Z AH N D <space> <space> TH R IY <space> HH AH N D R AH D <space> T UW <space> G OW <eos> Consecutive <space> found in phoneme: <sos> DH IH S <space> B UH K <space> W AA Z <space> P AH B L IH SH T <space> IH N <space> W AH N <space> TH AW Z AH N D <space> <space> EY T <space> HH AH N D R AH D <space> AH N D <space> F IH F T IY <space> T UW <eos> Consecutive <space> found in phoneme: <sos> IH N T R AH D UW S T <space> IH N T UW <space> S ER V AH S <space> IH N <space> W AH N <space> TH AW Z AH N D <space> <space> N AY N <space> HH AH N D R AH D <space> AH N D <space> S IH K S T IY <space> N AY N <eos> Consecutive <space> found in phoneme: <sos> R AY T <space> AH P <space> AH N T IH L <space> W AH N <space> TH AW Z AH N D <space> <space> N AY N <space> HH AH N D R AH D <space> AH N D <space> N AY N T IY <space> EY T <eos> Consecutive <space> found in phoneme: <sos> DH AE T <space> W AA Z <space> IH N <space> W AH N <space> TH AW Z AH N D <space> <space> N AY N <space> HH AH N D R AH D <space> AH N D <space> S EH V AH N T IY <space> TH R IY <eos> Consecutive <space> found in phoneme: <sos> AH N D <space> DH AE T <space> T UH K <space> P L EY S <space> IH N <space> W AH N <space> TH AW Z AH N D <space> <space> EY T <space> HH AH N D R AH D <space> AH N D <space> TH ER D IY <space> F AY V <eos> Consecutive <space> found in phoneme: <sos> W EH N <space> IH T <space> OW P AH N D <space> IH N <space> W AH N <space> TH AW Z AH N D <space> <space> EY T <space> HH AH N D R AH D <space> AH N D <space> T W EH N T IY <space> S IH K S <eos> Consecutive <space> found in phoneme: <sos> W IY <space> G AA T <space> IH T <space> IH N <space> W AH N <space> TH AW Z AH N D <space> <space> EY T <space> HH AH N D R AH D <space> AH N D <space> F AO R T IY <space> EY T <eos> Consecutive <space> found in phoneme: <sos> DH IY Z <space> W ER <space> N OW N <space> AE Z <space> DH AH <space> AO S T EH R IH T IY <space> G EY M Z <space> IH N <space> W AH N <space> TH AW Z AH N D <space> <space> N AY N <space> HH AH N D R AH D <space> AH N D <space> F AO R T IY <space> EY T <eos> Consecutive <space> found in phoneme: <sos> DH IY Z <space> W ER <space> N OW N <space> AE Z <space> DH AH <space> AO S T EH R IH T IY <space> G EY M Z <space> IH N <space> W AH N <space> TH AW Z AH N D <space> <space> N AY N <space> HH AH N D R AH D <space> AH N D <space> F AO R T IY <space> EY T <eos> Consecutive <space> found in phoneme: <sos> S OW <space> AY <space> R EH K AH N <space> AH B AW T <space> W AH N <space> TH AW Z AH N D <space> <space> S EH V AH N <space> HH AH N D R AH D <space> AH N D <space> N AY N T IY <eos> Consecutive <space> found in phoneme: <sos> P AH B L IH SH T <space> IH N <space> W AH N <space> TH AW Z AH N D <space> <space> N AY N <space> HH AH N D R AH D <space> AH N D <space> EY T IY <space> F AO R <eos> Consecutive <space> found in phoneme: <sos> AY <space> TH IH NG K <space> IH N <space> JH ER M AH N IY <space> IH T <space> S T AA R T AH D <space> W IH DH <space> DH AH <space> R EH V AH L UW SH AH N <space> IH N <space> W AH N <space> TH AW Z AH N D <space> <space> EY T <space> HH AH N D R AH D <space> AH N D <space> F AO R T IY <space> EY T <eos> Consecutive <space> found in phoneme: <sos> AH N D <space> DH AE T <space> W AA Z <space> AH B AW T <space> W AH N <space> TH AW Z AH N D <space> <space> EY T <space> HH AH N D R AH D <space> AH N D <space> F AO R T IY <space> T UW <eos>
In [ ]:
def count_consecutive_special_tokens(sentence_sequences, phoneme_sequences):
special_tokens = ['<eos>', '<sos>', '<space>']
count = 0
for seq in sentence_sequences:
for token in special_tokens:
if f"{token} {token}" in seq:
count += 1
for seq in phoneme_sequences:
for token in special_tokens:
if f"{token} {token}" in ' '.join(seq):
count += 1
return count
# Example usage:
count = count_consecutive_special_tokens(df['sentence_with_tokens'], df['phonemes'])
print(f"Number of sentences with consecutive special tokens: {count}")
Number of sentences with consecutive special tokens: 114
In [ ]:
import matplotlib.pyplot as plt
import seaborn as sns
# Count consecutive special tokens
count = count_consecutive_special_tokens(df['sentence_with_tokens'], df['phonemes'])
# Data for visualization
labels = ['Sentences with Consecutive Tokens', 'Total Sentences']
values = [count, len(df) - count]
percentages = [value / len(df) * 100 for value in values]
# Colors for the visualizations
colors = ['#3498DB', '#E74C3C']
# Visualization
plt.figure(figsize=(10, 6))
sns.set_context("talk", font_scale=0.8)
bars = sns.barplot(x=labels, y=values, palette=colors)
# Annotate the bars with the count value and percentage
for index, (value, percentage) in enumerate(zip(values, percentages)):
plt.text(index, value + (0.02 * max(values)),
f"{value} ({percentage:.1f}%)",
ha='center', va='center', fontweight='bold', fontsize=14)
# Set title and labels
plt.title('Sentences with Consecutive Special Tokens vs. Total Sentences', fontsize=15)
plt.ylabel('Number of Sentences', fontsize=13)
plt.xticks(fontsize=12)
# Ensure the text fits within the figure bounds
plt.tight_layout()
# Show the plot
plt.show()
In [ ]:
df.head()
Out[ ]:
| sentence | word_count | original_sentence | word_count_after_conversion | changed | phonemes | num_spaces | phonemes_str | sentence_with_tokens | phoneme_count | tokenized_sentence | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | when you are cooking chips at home | 7 | WHEN YOU'RE COOKING CHIPS AT HOME | 6 | False | [<sos>, W, EH, N, <space>, Y, UW, <space>, AA,... | 6 | <sos> W EH N <space> Y UW <space> AA R <space>... | <sos> when <space> you <space> are <space> coo... | 29 | <sos> when <space> you <space> are <space> coo... |
| 1 | the traditional chip pan often stays on the shelf | 9 | THE TRADITIONAL CHIP PAN OFTEN STAYS ON THE SHELF | 9 | False | [<sos>, DH, AH, <space>, T, R, AH, D, IH, SH, ... | 8 | <sos> DH AH <space> T R AH D IH SH AH N AH L <... | <sos> the <space> traditional <space> chip <sp... | 44 | <sos> the <space> traditional <space> chip <sp... |
| 2 | through what they call a knife block | 7 | THROUGH WHAT THEY CALL A KNIFE BLOCK | 7 | False | [<sos>, TH, R, UW, <space>, W, AH, T, <space>,... | 6 | <sos> TH R UW <space> W AH T <space> DH EY <sp... | <sos> through <space> what <space> they <space... | 27 | <sos> through <space> what <space> they <space... |
| 3 | which involves firing a potato down a pipe | 8 | WHICH INVOLVES FIRING A POTATO DOWN A PIPE | 8 | False | [<sos>, W, IH, CH, <space>, IH, N, V, AA, L, V... | 7 | <sos> W IH CH <space> IH N V AA L V Z <space> ... | <sos> which <space> involves <space> firing <s... | 38 | <sos> which <space> involves <space> firing <s... |
| 4 | apart from the golden colour and the delicious... | 9 | APART FROM THE GOLDEN COLOUR AND THE DELICIOUS... | 9 | False | [<sos>, AH, P, AA, R, T, <space>, F, R, AH, M,... | 8 | <sos> AH P AA R T <space> F R AH M <space> DH ... | <sos> apart <space> from <space> the <space> g... | 49 | <sos> apart <space> from <space> the <space> g... |
In [ ]:
def has_consecutive_special_tokens(seq):
special_tokens = ['<eos>', '<sos>', '<space>']
for token in special_tokens:
if f"{token} {token}" in seq:
return True
return False
# Create a mask that is True for rows without consecutive special tokens
mask = ~df['sentence_with_tokens'].apply(has_consecutive_special_tokens) & ~df['phonemes'].apply(lambda x: has_consecutive_special_tokens(' '.join(x)))
# Index df with the mask
df = df[mask]
print(df)
sentence word_count \
0 when you are cooking chips at home 7
1 the traditional chip pan often stays on the shelf 9
2 through what they call a knife block 7
3 which involves firing a potato down a pipe 8
4 apart from the golden colour and the delicious... 9
... ... ...
45834 when he is not having his seizures when he is ... 13
45835 she wants attention from both of us and 8
45836 as much as we try to give it to her 10
45837 they so deserve 3
45838 thank you enough for what you have done 8
original_sentence \
0 WHEN YOU'RE COOKING CHIPS AT HOME
1 THE TRADITIONAL CHIP PAN OFTEN STAYS ON THE SHELF
2 THROUGH WHAT THEY CALL A KNIFE BLOCK
3 WHICH INVOLVES FIRING A POTATO DOWN A PIPE
4 APART FROM THE GOLDEN COLOUR AND THE DELICIOUS...
... ...
45834 WHEN HE'S NOT HAVING HIS SEIZURES WHEN HE'S NO...
45835 SHE WANTS ATTENTION FROM BOTH OF US AND
45836 AS MUCH AS WE TRY TO GIVE IT TO HER
45837 THEY SO DESERVE
45838 THANK YOU ENOUGH FOR WHAT YOU'VE DONE
word_count_after_conversion changed \
0 6 False
1 9 False
2 7 False
3 8 False
4 9 False
... ... ...
45834 11 False
45835 8 False
45836 10 False
45837 3 False
45838 7 False
phonemes num_spaces \
0 [<sos>, W, EH, N, <space>, Y, UW, <space>, AA,... 6
1 [<sos>, DH, AH, <space>, T, R, AH, D, IH, SH, ... 8
2 [<sos>, TH, R, UW, <space>, W, AH, T, <space>,... 6
3 [<sos>, W, IH, CH, <space>, IH, N, V, AA, L, V... 7
4 [<sos>, AH, P, AA, R, T, <space>, F, R, AH, M,... 8
... ... ...
45834 [<sos>, W, EH, N, <space>, HH, IY, <space>, IH... 12
45835 [<sos>, SH, IY, <space>, W, AA, N, T, S, <spac... 7
45836 [<sos>, AE, Z, <space>, M, AH, CH, <space>, AE... 9
45837 [<sos>, DH, EY, <space>, S, OW, <space>, D, IH... 2
45838 [<sos>, TH, AE, NG, K, <space>, Y, UW, <space>... 7
phonemes_str \
0 <sos> W EH N <space> Y UW <space> AA R <space>...
1 <sos> DH AH <space> T R AH D IH SH AH N AH L <...
2 <sos> TH R UW <space> W AH T <space> DH EY <sp...
3 <sos> W IH CH <space> IH N V AA L V Z <space> ...
4 <sos> AH P AA R T <space> F R AH M <space> DH ...
... ...
45834 <sos> W EH N <space> HH IY <space> IH Z <space...
45835 <sos> SH IY <space> W AA N T S <space> AH T EH...
45836 <sos> AE Z <space> M AH CH <space> AE Z <space...
45837 <sos> DH EY <space> S OW <space> D IH Z ER V <...
45838 <sos> TH AE NG K <space> Y UW <space> IH N AH ...
sentence_with_tokens phoneme_count \
0 <sos> when <space> you <space> are <space> coo... 29
1 <sos> the <space> traditional <space> chip <sp... 44
2 <sos> through <space> what <space> they <space... 27
3 <sos> which <space> involves <space> firing <s... 38
4 <sos> apart <space> from <space> the <space> g... 49
... ... ...
45834 <sos> when <space> he <space> is <space> not <... 54
45835 <sos> she <space> wants <space> attention <spa... 37
45836 <sos> as <space> much <space> as <space> we <s... 34
45837 <sos> they <space> so <space> deserve <eos> 13
45838 <sos> thank <space> you <space> enough <space>... 33
tokenized_sentence
0 <sos> when <space> you <space> are <space> coo...
1 <sos> the <space> traditional <space> chip <sp...
2 <sos> through <space> what <space> they <space...
3 <sos> which <space> involves <space> firing <s...
4 <sos> apart <space> from <space> the <space> g...
... ...
45834 <sos> when <space> he <space> is <space> not <...
45835 <sos> she <space> wants <space> attention <spa...
45836 <sos> as <space> much <space> as <space> we <s...
45837 <sos> they <space> so <space> deserve <eos>
45838 <sos> thank <space> you <space> enough <space>...
[45700 rows x 11 columns]
In [ ]:
space_sentences = df[df['phonemes'].apply(lambda x: ' ' in x)]
print(space_sentences[['sentence', 'phonemes']])
Empty DataFrame Columns: [sentence, phonemes] Index: []
In [ ]:
import sys
sys.path.append('/content/drive/MyDrive/Dissertation')
from label_vectorization import SentenceVectorizer
In [ ]:
# Get the 10 most common phonemes
most_common_phonemes = phoneme_freq.most_common(10)
# Print the 10 most common phonemes
print("10 Most Common Phonemes:")
for phoneme, count in most_common_phonemes:
print(f"{phoneme}: {count}")
# Set up the visualization with a refined style and context
sns.set_style("whitegrid")
sns.set_context("talk")
plt.figure(figsize=(15, 8))
# Extract phoneme names and their counts
phonemes = [phoneme for phoneme, _ in most_common_phonemes]
counts = [count for _, count in most_common_phonemes]
# Use a sophisticated color palette (deep muted colors)
palette = sns.color_palette("viridis", n_colors=len(most_common_phonemes))
# Plot the phoneme frequencies
bars = sns.barplot(x=phonemes, y=counts, palette=palette)
# Add annotations to each bar
for index, value in enumerate(counts):
bars.text(index, value + max(counts)*0.02, f'{value} ({value/sum(counts)*100:.1f}%)', color='black', ha="center", va="bottom", fontsize=12)
# Set title, xlabel, ylabel and adjust font sizes
plt.title('Top 10 Phoneme Frequencies', fontsize=22, fontweight='bold', pad=20)
plt.xlabel('Phoneme', fontsize=18)
plt.ylabel('Frequency', fontsize=18)
plt.xticks(fontsize=16)
plt.yticks(fontsize=16)
# Ensure the plot layout is organized
plt.tight_layout()
# Show the plot
plt.show()
10 Most Common Phonemes: <space>: 299529 AH: 111029 T: 91599 N: 77726 IH: 75183 R: 52083 S: 50329 D: 47510 <sos>: 45814 <eos>: 45814
In [ ]:
# Concatenate all lists of phonemes and create a Counter object
all_phonemes = [phoneme for sublist in df['phonemes'] for phoneme in sublist]
phoneme_freq = Counter(all_phonemes)
# Get all unique phonemes
unique_phonemes = list(phoneme_freq.keys())
unique_phonemes
Out[ ]:
['<sos>', 'W', 'EH', 'N', '<space>', 'Y', 'UW', 'AA', 'R', 'K', 'UH', 'IH', 'NG', 'CH', 'P', 'S', 'AE', 'T', 'HH', 'OW', 'M', '<eos>', 'DH', 'AH', 'D', 'SH', 'L', 'AO', 'F', 'EY', 'Z', 'TH', 'AY', 'B', 'V', 'AW', 'G', 'ER', 'IY', 'JH', 'OY', 'ZH']
In [ ]:
# Define viseme categories
viseme_dict = {
'aa': ['aa', 'aw', 'ay'], 'ah': ['ah'], 'ao': ['ao', 'oy', 'ow'],
'ch': ['jh', 'ch', 'sh', 'zh'], 'er': ['er'], 'ey': ['eh', 'ey', 'ae'],
'f': ['f', 'v'], 'iy': ['ih', 'iy'], 'k': ['k', 'g', 'ng', 'n'],
'p': ['p', 'b', 'm'], 't': ['t', 'd', 's', 'z', 'th', 'dh'],
'uh': ['uh', 'uw'], 'w': ['w', 'r', 'l', 'y', 'hh'],
'space': ['<space>'], 'sos': ['<sos>'], 'eos': ['<eos>']
}
phoneme_to_viseme = {phoneme: viseme for viseme, phonemes in viseme_dict.items() for phoneme in phonemes}
def phonemes_to_visemes(phonemes):
visemes = []
for phoneme in phonemes:
if phoneme in ['<sos>', '<eos>', '<space>']:
visemes.append(phoneme)
else:
phoneme = phoneme[:-1] if phoneme[-1].isdigit() else phoneme
viseme = phoneme_to_viseme.get(phoneme, 'unknown')
visemes.append(viseme)
return visemes
# Example DataFrame
df_check = pd.DataFrame({
'phonemes': [['<sos>', 'W', 'EH', 'N', '<space>', 'Y', 'UW', 'K', 'UH', 'IH', 'NG', 'CH', 'P', 'S', 'AE', 'T', 'HH', 'OW', 'M', '<eos>', 'DH', 'AH', 'R', 'D', 'SH', 'L', 'AO', 'F', 'EY', 'Z', 'AA', 'TH', 'AY', 'B', 'V', 'AW', 'G', 'ER', 'IY', 'JH', 'OY', 'ZH']]
})
# Convert phonemes to lowercase
df['phonemes'] = df['phonemes'].apply(lambda phonemes: [phoneme.lower() for phoneme in phonemes])
# Convert phonemes to visemes in df_expanded
df['visemes'] = df['phonemes'].apply(phonemes_to_visemes)
# Print the first few rows to check the results
print(df[['phonemes', 'visemes']].head())
# Visual Inspection
print(df[['phonemes', 'visemes']].sample(5))
# Mapping Consistency
phoneme_to_viseme = {}
inconsistencies = 0
for phonemes, visemes in zip(df['phonemes'], df['visemes']):
for phoneme, viseme in zip(phonemes, visemes):
phoneme = phoneme[:-1] if phoneme[-1].isdigit() else phoneme
if phoneme in phoneme_to_viseme:
if phoneme_to_viseme[phoneme] != viseme:
inconsistencies += 1
else:
phoneme_to_viseme[phoneme] = viseme
print(f'Number of inconsistencies in mapping: {inconsistencies}')
# Usage of Unknown Visemes
unknown_visemes_count = df['visemes'].apply(lambda x: x.count('unknown')).sum()
print(f'Number of unknown visemes: {unknown_visemes_count}')
phonemes \
0 [<sos>, w, eh, n, <space>, y, uw, <space>, aa,...
1 [<sos>, dh, ah, <space>, t, r, ah, d, ih, sh, ...
2 [<sos>, th, r, uw, <space>, w, ah, t, <space>,...
3 [<sos>, w, ih, ch, <space>, ih, n, v, aa, l, v...
4 [<sos>, ah, p, aa, r, t, <space>, f, r, ah, m,...
visemes
0 [<sos>, w, ey, k, <space>, w, uh, <space>, aa,...
1 [<sos>, t, ah, <space>, t, w, ah, t, iy, ch, a...
2 [<sos>, t, w, uh, <space>, w, ah, t, <space>, ...
3 [<sos>, w, iy, ch, <space>, iy, k, f, aa, w, f...
4 [<sos>, ah, p, aa, w, t, <space>, f, w, ah, p,...
phonemes \
16950 [<sos>, ih, z, <space>, n, aa, t, <space>, dh,...
9165 [<sos>, ae, z, <space>, dh, ah, <space>, d, ae...
17444 [<sos>, ih, t, <space>, w, aa, z, <space>, hh,...
1656 [<sos>, y, uw, <space>, hh, ae, v, <space>, g,...
40670 [<sos>, w, iy, <space>, aa, r, <space>, ae, s,...
visemes
16950 [<sos>, iy, t, <space>, k, aa, t, <space>, t, ...
9165 [<sos>, ey, t, <space>, t, ah, <space>, t, ey,...
17444 [<sos>, iy, t, <space>, w, aa, t, <space>, w, ...
1656 [<sos>, w, uh, <space>, w, ey, f, <space>, k, ...
40670 [<sos>, w, iy, <space>, aa, w, <space>, ey, t,...
Number of inconsistencies in mapping: 0
Number of unknown visemes: 0
In [ ]:
# Set display options
pd.set_option('display.max_rows', 5)
pd.set_option('display.max_colwidth', None)
# Display the first 5 rows
display(df[['phonemes', 'visemes']].head())
| phonemes | visemes | |
|---|---|---|
| 0 | [<sos>, w, eh, n, <space>, y, uw, <space>, aa, r, <space>, k, uh, k, ih, ng, <space>, ch, ih, p, s, <space>, ae, t, <space>, hh, ow, m, <eos>] | [<sos>, w, ey, k, <space>, w, uh, <space>, aa, w, <space>, k, uh, k, iy, k, <space>, ch, iy, p, t, <space>, ey, t, <space>, w, ao, p, <eos>] |
| 1 | [<sos>, dh, ah, <space>, t, r, ah, d, ih, sh, ah, n, ah, l, <space>, ch, ih, p, <space>, p, ae, n, <space>, ao, f, ah, n, <space>, s, t, ey, z, <space>, aa, n, <space>, dh, ah, <space>, sh, eh, l, f, <eos>] | [<sos>, t, ah, <space>, t, w, ah, t, iy, ch, ah, k, ah, w, <space>, ch, iy, p, <space>, p, ey, k, <space>, ao, f, ah, k, <space>, t, t, ey, t, <space>, aa, k, <space>, t, ah, <space>, ch, ey, w, f, <eos>] |
| 2 | [<sos>, th, r, uw, <space>, w, ah, t, <space>, dh, ey, <space>, k, ao, l, <space>, ah, <space>, n, ay, f, <space>, b, l, aa, k, <eos>] | [<sos>, t, w, uh, <space>, w, ah, t, <space>, t, ey, <space>, k, ao, w, <space>, ah, <space>, k, aa, f, <space>, p, w, aa, k, <eos>] |
| 3 | [<sos>, w, ih, ch, <space>, ih, n, v, aa, l, v, z, <space>, f, ay, r, ih, ng, <space>, ah, <space>, p, ah, t, ey, t, ow, <space>, d, aw, n, <space>, ah, <space>, p, ay, p, <eos>] | [<sos>, w, iy, ch, <space>, iy, k, f, aa, w, f, t, <space>, f, aa, w, iy, k, <space>, ah, <space>, p, ah, t, ey, t, ao, <space>, t, aa, k, <space>, ah, <space>, p, aa, p, <eos>] |
| 4 | [<sos>, ah, p, aa, r, t, <space>, f, r, ah, m, <space>, dh, ah, <space>, g, ow, l, d, ah, n, <space>, k, ah, l, aw, r, <space>, ah, n, d, <space>, dh, ah, <space>, d, ih, l, ih, sh, ah, s, <space>, f, l, ae, v, er, <eos>] | [<sos>, ah, p, aa, w, t, <space>, f, w, ah, p, <space>, t, ah, <space>, k, ao, w, t, ah, k, <space>, k, ah, w, aa, w, <space>, ah, k, t, <space>, t, ah, <space>, t, iy, w, iy, ch, ah, t, <space>, f, w, ey, f, er, <eos>] |
In [ ]:
# Calculate the distribution of visemes in the dataset
viseme_distribution = pd.Series([item for sublist in df['visemes'] for item in sublist]).value_counts()
# Set up the visualization parameters
sns.set_style("whitegrid")
sns.set_palette("coolwarm_r")
sns.set_context("talk")
# Calculate the percentage of each viseme in the dataset
viseme_percentage = (viseme_distribution / viseme_distribution.sum()) * 100
# Create a horizontal bar plot for the visemes
plt.figure(figsize=(14, 10))
ax = sns.barplot(y=viseme_distribution.index, x=viseme_distribution.values, orient="h", palette="viridis")
# Annotate each bar with the count and percentage of each viseme
for index, value in enumerate(viseme_distribution.values):
ax.text(value, index,
f'{value} ({viseme_percentage[index]:.1f}%)',
color='black', ha="left", va="center", fontsize=10)
plt.title('Distribution of Visemes in the Dataset', fontsize=16, fontweight='bold')
plt.ylabel('Viseme', fontsize=14)
plt.xlabel('Count', fontsize=14)
plt.show()
In [ ]:
# Extract unique phonemes and visemes from the dataframe
unique_phonemes = set([item for sublist in df['phonemes'] for item in sublist])
unique_visemes = set([item for sublist in df['visemes'] for item in sublist])
# Exclude the special tokens from the filtered list
exclude_tokens = ['<space>', '<sos>', '<eos>', 'space', 'sos', 'eos']
filtered_phonemes = [phoneme for phoneme in unique_phonemes if phoneme not in exclude_tokens]
filtered_visemes = [viseme for viseme in unique_visemes if viseme not in exclude_tokens]
# Efficiently indexing the confusion matrix
phoneme_index = {phoneme: idx for idx, phoneme in enumerate(filtered_phonemes)}
viseme_index = {viseme: idx for idx, viseme in enumerate(filtered_visemes)}
# Create a matrix for the filtered phonemes and visemes
confusion_matrix = np.zeros((len(filtered_phonemes), len(filtered_visemes)))
# Update the matrix based on the mappings in the dataset
for phonemes, visemes in zip(df['phonemes'], df['visemes']):
for phoneme, viseme in zip(phonemes, visemes):
if phoneme in phoneme_index and viseme in viseme_index:
i = phoneme_index[phoneme]
j = viseme_index[viseme]
confusion_matrix[i][j] += 1
# Plot the heatmap
plt.figure(figsize=(14, 10))
ax = sns.heatmap(confusion_matrix, annot=True, fmt=".0f", cmap="Blues",
xticklabels=filtered_visemes, yticklabels=filtered_phonemes,
annot_kws={"size": 12})
plt.title("Phoneme to Viseme Mapping Heatmap", fontsize=18, fontweight='bold')
plt.xlabel("Viseme", fontsize=16)
plt.ylabel("Phoneme", fontsize=16)
plt.xticks(fontsize=12)
plt.yticks(fontsize=12)
plt.tight_layout()
plt.show()
In [ ]:
# Check length consistency between phonemes and visemes
length_consistency = df['phonemes'].str.len().equals(df['visemes'].str.len())
print(f'Length consistency: {length_consistency}')
# Calculate lengths
df['phoneme_length'] = df['phonemes'].apply(len)
df['viseme_length'] = df['visemes'].apply(len)
# Find mismatches
mismatches = df[df['phoneme_length'] != df['viseme_length']]
# Print the sentences, phonemes, and visemes for those rows
for _, row in mismatches.head().iterrows():
print(f"Sentence: {row['expanded_sentence']}")
print(f"Phonemes: {' '.join(row['phonemes'])}")
print(f"Visemes: {' '.join(row['visemes'])}")
print(f"Phoneme Length: {row['phoneme_length']}")
print(f"Viseme Length: {row['viseme_length']}\n")
# Display a sample of sentences, phonemes, and visemes for comparison
sample_comparison = df[['sentence', 'phonemes', 'visemes']].sample(5)
for _, row in sample_comparison.iterrows():
print(f"Sentence: {row['sentence']}")
print(f"Phonemes: {' '.join(row['phonemes'])}")
print(f"Visemes: {' '.join(row['visemes'])}\n")
Length consistency: True Sentence: we made some big bales as well Phonemes: <sos> w iy <space> m ey d <space> s ah m <space> b ih g <space> b ey l z <space> ae z <space> w eh l <eos> Visemes: <sos> w iy <space> p ey t <space> t ah p <space> p iy k <space> p ey w t <space> ey t <space> w ey w <eos> Sentence: they had places in london all through their lives too Phonemes: <sos> dh ey <space> hh ae d <space> p l ey s ah z <space> ih n <space> l ah n d ah n <space> ao l <space> th r uw <space> dh eh r <space> l ih v z <space> t uw <eos> Visemes: <sos> t ey <space> w ey t <space> p w ey t ah t <space> iy k <space> w ah k t ah k <space> ao w <space> t w uh <space> t ey w <space> w iy f t <space> t uh <eos> Sentence: the metropolitan cathedral Phonemes: <sos> dh ah <space> m eh t r ah p aa l ah t ah n <space> k ah th iy d r ah l <eos> Visemes: <sos> t ah <space> p ey t w ah p aa w ah t ah k <space> k ah t iy t w ah w <eos> Sentence: it gave that hint of sexuality Phonemes: <sos> ih t <space> g ey v <space> dh ae t <space> hh ih n t <space> ah v <space> s eh k sh uw ae l ah t iy <eos> Visemes: <sos> iy t <space> k ey f <space> t ey t <space> w iy k t <space> ah f <space> t ey k ch uh ey w ah t iy <eos> Sentence: we have our final two to play Phonemes: <sos> w iy <space> hh ae v <space> aw er <space> f ay n ah l <space> t uw <space> t uw <space> p l ey <eos> Visemes: <sos> w iy <space> w ey f <space> aa er <space> f aa k ah w <space> t uh <space> t uh <space> p w ey <eos>
In [ ]:
# Display Sample Comparisons
sample_df = df.sample(5)
for index, row in sample_df.iterrows():
print(f"Sentence {index + 1}: {row['sentence']}")
print(f"Phonemes: {' '.join(row['phonemes'])}")
print(f"Visemes: {' '.join(row['visemes'])}\n")
Sentence 14447: and it goes really high and every night Phonemes: <sos> ah n d <space> ih t <space> g ow z <space> r ih l iy <space> hh ay <space> ah n d <space> eh v er iy <space> n ay t <eos> Visemes: <sos> ah k t <space> iy t <space> k ao t <space> w iy w iy <space> w aa <space> ah k t <space> ey f er iy <space> k aa t <eos> Sentence 41803: she was very sensitive to the fact that monarchs could be replaced by this method Phonemes: <sos> sh iy <space> w aa z <space> v eh r iy <space> s eh n s ah t ih v <space> t uw <space> dh ah <space> f ae k t <space> dh ae t <space> m aa n aa r k s <space> k uh d <space> b iy <space> r iy p l ey s t <space> b ay <space> dh ih s <space> m eh th ah d <eos> Visemes: <sos> ch iy <space> w aa t <space> f ey w iy <space> t ey k t ah t iy f <space> t uh <space> t ah <space> f ey k t <space> t ey t <space> p aa k aa w k t <space> k uh t <space> p iy <space> w iy p w ey t t <space> p aa <space> t iy t <space> p ey t ah t <eos> Sentence 35827: i do not belong to any club Phonemes: <sos> ay <space> d uw <space> n aa t <space> b ih l ao ng <space> t uw <space> eh n iy <space> k l ah b <eos> Visemes: <sos> aa <space> t uh <space> k aa t <space> p iy w ao k <space> t uh <space> ey k iy <space> k w ah p <eos> Sentence 45508: what can be done to help farmers like james Phonemes: <sos> w ah t <space> k ae n <space> b iy <space> d ah n <space> t uw <space> hh eh l p <space> f aa r m er z <space> l ay k <space> jh ey m z <eos> Visemes: <sos> w ah t <space> k ey k <space> p iy <space> t ah k <space> t uh <space> w ey w p <space> f aa w p er t <space> w aa k <space> ch ey p t <eos> Sentence 24063: and dirac did not like to speak in french Phonemes: <sos> ah n d <space> d ih r ah k <space> d ih d <space> n aa t <space> l ay k <space> t uw <space> s p iy k <space> ih n <space> f r eh n ch <eos> Visemes: <sos> ah k t <space> t iy w ah k <space> t iy t <space> k aa t <space> w aa k <space> t uh <space> t p iy k <space> iy k <space> f w ey k ch <eos>
In [ ]:
import os
# Store the original directory
original_directory = os.getcwd()
# Change to the directory where phonemes.txt is located
os.chdir('/content/drive/MyDrive/Dissertation/')
# Revert back to the original directory
os.chdir(original_directory)
In [ ]:
print(df.columns)
Index(['sentence', 'word_count', 'original_sentence',
'word_count_after_conversion', 'changed', 'phonemes', 'num_spaces',
'phonemes_str', 'sentence_with_tokens', 'phoneme_count',
'tokenized_sentence', 'visemes', 'phoneme_length', 'viseme_length'],
dtype='object')
In [ ]:
df.head()
Out[ ]:
| sentence | word_count | original_sentence | word_count_after_conversion | changed | phonemes | num_spaces | phonemes_str | sentence_with_tokens | phoneme_count | tokenized_sentence | visemes | phoneme_length | viseme_length | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | when you are cooking chips at home | 7 | WHEN YOU'RE COOKING CHIPS AT HOME | 6 | False | [<sos>, w, eh, n, <space>, y, uw, <space>, aa, r, <space>, k, uh, k, ih, ng, <space>, ch, ih, p, s, <space>, ae, t, <space>, hh, ow, m, <eos>] | 6 | <sos> W EH N <space> Y UW <space> AA R <space> K UH K IH NG <space> CH IH P S <space> AE T <space> HH OW M <eos> | <sos> when <space> you <space> are <space> cooking <space> chips <space> at <space> home <eos> | 29 | <sos> when <space> you <space> are <space> cooking <space> chips <space> at <space> home <eos> | [<sos>, w, ey, k, <space>, w, uh, <space>, aa, w, <space>, k, uh, k, iy, k, <space>, ch, iy, p, t, <space>, ey, t, <space>, w, ao, p, <eos>] | 29 | 29 |
| 1 | the traditional chip pan often stays on the shelf | 9 | THE TRADITIONAL CHIP PAN OFTEN STAYS ON THE SHELF | 9 | False | [<sos>, dh, ah, <space>, t, r, ah, d, ih, sh, ah, n, ah, l, <space>, ch, ih, p, <space>, p, ae, n, <space>, ao, f, ah, n, <space>, s, t, ey, z, <space>, aa, n, <space>, dh, ah, <space>, sh, eh, l, f, <eos>] | 8 | <sos> DH AH <space> T R AH D IH SH AH N AH L <space> CH IH P <space> P AE N <space> AO F AH N <space> S T EY Z <space> AA N <space> DH AH <space> SH EH L F <eos> | <sos> the <space> traditional <space> chip <space> pan <space> often <space> stays <space> on <space> the <space> shelf <eos> | 44 | <sos> the <space> traditional <space> chip <space> pan <space> often <space> stays <space> on <space> the <space> shelf <eos> | [<sos>, t, ah, <space>, t, w, ah, t, iy, ch, ah, k, ah, w, <space>, ch, iy, p, <space>, p, ey, k, <space>, ao, f, ah, k, <space>, t, t, ey, t, <space>, aa, k, <space>, t, ah, <space>, ch, ey, w, f, <eos>] | 44 | 44 |
| 2 | through what they call a knife block | 7 | THROUGH WHAT THEY CALL A KNIFE BLOCK | 7 | False | [<sos>, th, r, uw, <space>, w, ah, t, <space>, dh, ey, <space>, k, ao, l, <space>, ah, <space>, n, ay, f, <space>, b, l, aa, k, <eos>] | 6 | <sos> TH R UW <space> W AH T <space> DH EY <space> K AO L <space> AH <space> N AY F <space> B L AA K <eos> | <sos> through <space> what <space> they <space> call <space> a <space> knife <space> block <eos> | 27 | <sos> through <space> what <space> they <space> call <space> a <space> knife <space> block <eos> | [<sos>, t, w, uh, <space>, w, ah, t, <space>, t, ey, <space>, k, ao, w, <space>, ah, <space>, k, aa, f, <space>, p, w, aa, k, <eos>] | 27 | 27 |
| 3 | which involves firing a potato down a pipe | 8 | WHICH INVOLVES FIRING A POTATO DOWN A PIPE | 8 | False | [<sos>, w, ih, ch, <space>, ih, n, v, aa, l, v, z, <space>, f, ay, r, ih, ng, <space>, ah, <space>, p, ah, t, ey, t, ow, <space>, d, aw, n, <space>, ah, <space>, p, ay, p, <eos>] | 7 | <sos> W IH CH <space> IH N V AA L V Z <space> F AY R IH NG <space> AH <space> P AH T EY T OW <space> D AW N <space> AH <space> P AY P <eos> | <sos> which <space> involves <space> firing <space> a <space> potato <space> down <space> a <space> pipe <eos> | 38 | <sos> which <space> involves <space> firing <space> a <space> potato <space> down <space> a <space> pipe <eos> | [<sos>, w, iy, ch, <space>, iy, k, f, aa, w, f, t, <space>, f, aa, w, iy, k, <space>, ah, <space>, p, ah, t, ey, t, ao, <space>, t, aa, k, <space>, ah, <space>, p, aa, p, <eos>] | 38 | 38 |
| 4 | apart from the golden colour and the delicious flavour | 9 | APART FROM THE GOLDEN COLOUR AND THE DELICIOUS FLAVOUR | 9 | False | [<sos>, ah, p, aa, r, t, <space>, f, r, ah, m, <space>, dh, ah, <space>, g, ow, l, d, ah, n, <space>, k, ah, l, aw, r, <space>, ah, n, d, <space>, dh, ah, <space>, d, ih, l, ih, sh, ah, s, <space>, f, l, ae, v, er, <eos>] | 8 | <sos> AH P AA R T <space> F R AH M <space> DH AH <space> G OW L D AH N <space> K AH L AW R <space> AH N D <space> DH AH <space> D IH L IH SH AH S <space> F L AE V ER <eos> | <sos> apart <space> from <space> the <space> golden <space> colour <space> and <space> the <space> delicious <space> flavour <eos> | 49 | <sos> apart <space> from <space> the <space> golden <space> colour <space> and <space> the <space> delicious <space> flavour <eos> | [<sos>, ah, p, aa, w, t, <space>, f, w, ah, p, <space>, t, ah, <space>, k, ao, w, t, ah, k, <space>, k, ah, w, aa, w, <space>, ah, k, t, <space>, t, ah, <space>, t, iy, w, iy, ch, ah, t, <space>, f, w, ey, f, er, <eos>] | 49 | 49 |
In [ ]:
df.shape
Out[ ]:
(45700, 14)
In [ ]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
# Prepare the viseme data
viseme_tokenizer = Tokenizer(filters='', lower=False, split=' ')
viseme_tokenizer.fit_on_texts(df['visemes'])
viseme_sequences = viseme_tokenizer.texts_to_sequences(df['visemes'])
viseme_MAX_LEN = max(len(seq) for seq in viseme_sequences)
X_data = pad_sequences(viseme_sequences, maxlen=viseme_MAX_LEN, padding='post')
# Prepare the sentence data
sentence_tokenizer = Tokenizer(filters='!"#$%&()*+,-./:;=?@[\\]^_`{|}~\t\n')
sentence_tokenizer.fit_on_texts(df['sentence_with_tokens'])
sentence_sequences = sentence_tokenizer.texts_to_sequences(df['sentence_with_tokens'])
sentence_MAX_LEN = max(len(seq) for seq in sentence_sequences)
y_data = pad_sequences(sentence_sequences, maxlen=sentence_MAX_LEN, padding='post')
print("X_data:\n", X_data[:5])
print("\ny_data:\n", y_data[:5])
# Check if the special tokens <sos>, <space>, and <eos> are included in the tokenized sequences
special_tokens = ['<sos>', '<space>', '<eos>']
for token in special_tokens:
token_index = viseme_tokenizer.word_index[token]
token_in_X_data = any(token_index in seq for seq in X_data)
token_in_y_data = any(token_index in seq for seq in y_data)
print(f"\nIs '{token}' included in X_data? {token_in_X_data}")
print(f"Is '{token}' included in y_data? {token_in_y_data}")
X_data:
[[10 3 7 4 1 3 14 1 9 3 1 4 14 4 5 4 1 16 5 8 2 1 7 2
1 3 13 8 11 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0]
[10 2 6 1 2 3 6 2 5 16 6 4 6 3 1 16 5 8 1 8 7 4 1 13
12 6 4 1 2 2 7 2 1 9 4 1 2 6 1 16 7 3 12 11 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0]
[10 2 3 14 1 3 6 2 1 2 7 1 4 13 3 1 6 1 4 9 12 1 8 3
9 4 11 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0]
[10 3 5 16 1 5 4 12 9 3 12 2 1 12 9 3 5 4 1 6 1 8 6 2
7 2 13 1 2 9 4 1 6 1 8 9 8 11 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0]
[10 6 8 9 3 2 1 12 3 6 8 1 2 6 1 4 13 3 2 6 4 1 4 6
3 9 3 1 6 4 2 1 2 6 1 2 5 3 5 16 6 2 1 12 3 7 12 15
11 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0]]
y_data:
[[ 2 49 1 13 1 20 1 997 1 1629 1 38 1 145
3 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0]
[ 2 4 1 1032 1 3014 1 4422 1 356 1 3334 1 22
1 4 1 4423 3 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0]
[ 2 160 1 28 1 21 1 313 1 8 1 3015 1 2148
3 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0]
[ 2 64 1 3817 1 3335 1 8 1 3818 1 115 1 8
1 4424 3 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0]
[ 2 509 1 42 1 4 1 2036 1 1101 1 6 1 4
1 2149 1 1809 3 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0]]
Is '<sos>' included in X_data? True
Is '<sos>' included in y_data? True
Is '<space>' included in X_data? True
Is '<space>' included in y_data? True
Is '<eos>' included in X_data? True
Is '<eos>' included in y_data? True
In [ ]:
# Seaborn Plot
sns.set_style("whitegrid")
sns.set_context("talk")
palette = ["#3498db", "#e74c3c"] # Blue and Red palette
# Boxplot Visualization
plt.figure(figsize=(16, 7))
# Boxplot for X_data (Viseme)
plt.subplot(1, 2, 1)
sns.boxplot(x=X_data.ravel(), color=palette[0])
plt.title('Boxplot of Encoded Values for Visemes', fontweight='bold')
plt.xlabel('Encoded Value', fontsize=14)
plt.xticks(fontsize=12)
plt.yticks(fontsize=12)
# Boxplot for y_data (Sentences)
plt.subplot(1, 2, 2)
sns.boxplot(x=y_data.ravel(), color=palette[1])
plt.title('Boxplot of Encoded Values for Sentences', fontweight='bold')
plt.xlabel('Encoded Value', fontsize=14)
plt.xticks(fontsize=12)
plt.yticks(fontsize=12)
plt.tight_layout()
plt.show()
In [ ]:
print(df.columns)
Index(['sentence', 'word_count', 'original_sentence',
'word_count_after_conversion', 'changed', 'phonemes', 'num_spaces',
'phonemes_str', 'sentence_with_tokens', 'phoneme_count',
'tokenized_sentence', 'visemes', 'phoneme_length', 'viseme_length'],
dtype='object')
In [ ]:
df.head()
Out[ ]:
| sentence | word_count | original_sentence | word_count_after_conversion | changed | phonemes | num_spaces | phonemes_str | sentence_with_tokens | phoneme_count | tokenized_sentence | visemes | phoneme_length | viseme_length | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | when you are cooking chips at home | 7 | WHEN YOU'RE COOKING CHIPS AT HOME | 6 | False | [<sos>, w, eh, n, <space>, y, uw, <space>, aa, r, <space>, k, uh, k, ih, ng, <space>, ch, ih, p, s, <space>, ae, t, <space>, hh, ow, m, <eos>] | 6 | <sos> W EH N <space> Y UW <space> AA R <space> K UH K IH NG <space> CH IH P S <space> AE T <space> HH OW M <eos> | <sos> when <space> you <space> are <space> cooking <space> chips <space> at <space> home <eos> | 29 | <sos> when <space> you <space> are <space> cooking <space> chips <space> at <space> home <eos> | [<sos>, w, ey, k, <space>, w, uh, <space>, aa, w, <space>, k, uh, k, iy, k, <space>, ch, iy, p, t, <space>, ey, t, <space>, w, ao, p, <eos>] | 29 | 29 |
| 1 | the traditional chip pan often stays on the shelf | 9 | THE TRADITIONAL CHIP PAN OFTEN STAYS ON THE SHELF | 9 | False | [<sos>, dh, ah, <space>, t, r, ah, d, ih, sh, ah, n, ah, l, <space>, ch, ih, p, <space>, p, ae, n, <space>, ao, f, ah, n, <space>, s, t, ey, z, <space>, aa, n, <space>, dh, ah, <space>, sh, eh, l, f, <eos>] | 8 | <sos> DH AH <space> T R AH D IH SH AH N AH L <space> CH IH P <space> P AE N <space> AO F AH N <space> S T EY Z <space> AA N <space> DH AH <space> SH EH L F <eos> | <sos> the <space> traditional <space> chip <space> pan <space> often <space> stays <space> on <space> the <space> shelf <eos> | 44 | <sos> the <space> traditional <space> chip <space> pan <space> often <space> stays <space> on <space> the <space> shelf <eos> | [<sos>, t, ah, <space>, t, w, ah, t, iy, ch, ah, k, ah, w, <space>, ch, iy, p, <space>, p, ey, k, <space>, ao, f, ah, k, <space>, t, t, ey, t, <space>, aa, k, <space>, t, ah, <space>, ch, ey, w, f, <eos>] | 44 | 44 |
| 2 | through what they call a knife block | 7 | THROUGH WHAT THEY CALL A KNIFE BLOCK | 7 | False | [<sos>, th, r, uw, <space>, w, ah, t, <space>, dh, ey, <space>, k, ao, l, <space>, ah, <space>, n, ay, f, <space>, b, l, aa, k, <eos>] | 6 | <sos> TH R UW <space> W AH T <space> DH EY <space> K AO L <space> AH <space> N AY F <space> B L AA K <eos> | <sos> through <space> what <space> they <space> call <space> a <space> knife <space> block <eos> | 27 | <sos> through <space> what <space> they <space> call <space> a <space> knife <space> block <eos> | [<sos>, t, w, uh, <space>, w, ah, t, <space>, t, ey, <space>, k, ao, w, <space>, ah, <space>, k, aa, f, <space>, p, w, aa, k, <eos>] | 27 | 27 |
| 3 | which involves firing a potato down a pipe | 8 | WHICH INVOLVES FIRING A POTATO DOWN A PIPE | 8 | False | [<sos>, w, ih, ch, <space>, ih, n, v, aa, l, v, z, <space>, f, ay, r, ih, ng, <space>, ah, <space>, p, ah, t, ey, t, ow, <space>, d, aw, n, <space>, ah, <space>, p, ay, p, <eos>] | 7 | <sos> W IH CH <space> IH N V AA L V Z <space> F AY R IH NG <space> AH <space> P AH T EY T OW <space> D AW N <space> AH <space> P AY P <eos> | <sos> which <space> involves <space> firing <space> a <space> potato <space> down <space> a <space> pipe <eos> | 38 | <sos> which <space> involves <space> firing <space> a <space> potato <space> down <space> a <space> pipe <eos> | [<sos>, w, iy, ch, <space>, iy, k, f, aa, w, f, t, <space>, f, aa, w, iy, k, <space>, ah, <space>, p, ah, t, ey, t, ao, <space>, t, aa, k, <space>, ah, <space>, p, aa, p, <eos>] | 38 | 38 |
| 4 | apart from the golden colour and the delicious flavour | 9 | APART FROM THE GOLDEN COLOUR AND THE DELICIOUS FLAVOUR | 9 | False | [<sos>, ah, p, aa, r, t, <space>, f, r, ah, m, <space>, dh, ah, <space>, g, ow, l, d, ah, n, <space>, k, ah, l, aw, r, <space>, ah, n, d, <space>, dh, ah, <space>, d, ih, l, ih, sh, ah, s, <space>, f, l, ae, v, er, <eos>] | 8 | <sos> AH P AA R T <space> F R AH M <space> DH AH <space> G OW L D AH N <space> K AH L AW R <space> AH N D <space> DH AH <space> D IH L IH SH AH S <space> F L AE V ER <eos> | <sos> apart <space> from <space> the <space> golden <space> colour <space> and <space> the <space> delicious <space> flavour <eos> | 49 | <sos> apart <space> from <space> the <space> golden <space> colour <space> and <space> the <space> delicious <space> flavour <eos> | [<sos>, ah, p, aa, w, t, <space>, f, w, ah, p, <space>, t, ah, <space>, k, ao, w, t, ah, k, <space>, k, ah, w, aa, w, <space>, ah, k, t, <space>, t, ah, <space>, t, iy, w, iy, ch, ah, t, <space>, f, w, ey, f, er, <eos>] | 49 | 49 |
In [ ]:
# Check the structure of the df['visemes'] column
print("First 5 entries in 'visemes' column:")
print(df['visemes'].head())
# Check the structure of the df['tokenized_sentence'] column
print("\nFirst 5 entries in 'tokenized_sentence' column:")
print(df['sentence_with_tokens'].head())
# Check if the special tokens <sos>, <space>, and <eos> are already included in the data
special_tokens = ['<sos>', '<space>', '<eos>']
for token in special_tokens:
token_in_visemes = df['visemes'].apply(lambda x: token in x).any()
token_in_tokenized_sentence = df['sentence_with_tokens'].apply(lambda x: token in x).any()
print(f"\nIs '{token}' included in 'visemes' column? {token_in_visemes}")
print(f"Is '{token}' included in 'tokenized_sentence' column? {token_in_tokenized_sentence}")
First 5 entries in 'visemes' column: 0 [<sos>, w, ey, k, <space>, w, uh, <space>, aa, w, <space>, k, uh, k, iy, k, <space>, ch, iy, p, t, <space>, ey, t, <space>, w, ao, p, <eos>] 1 [<sos>, t, ah, <space>, t, w, ah, t, iy, ch, ah, k, ah, w, <space>, ch, iy, p, <space>, p, ey, k, <space>, ao, f, ah, k, <space>, t, t, ey, t, <space>, aa, k, <space>, t, ah, <space>, ch, ey, w, f, <eos>] 2 [<sos>, t, w, uh, <space>, w, ah, t, <space>, t, ey, <space>, k, ao, w, <space>, ah, <space>, k, aa, f, <space>, p, w, aa, k, <eos>] 3 [<sos>, w, iy, ch, <space>, iy, k, f, aa, w, f, t, <space>, f, aa, w, iy, k, <space>, ah, <space>, p, ah, t, ey, t, ao, <space>, t, aa, k, <space>, ah, <space>, p, aa, p, <eos>] 4 [<sos>, ah, p, aa, w, t, <space>, f, w, ah, p, <space>, t, ah, <space>, k, ao, w, t, ah, k, <space>, k, ah, w, aa, w, <space>, ah, k, t, <space>, t, ah, <space>, t, iy, w, iy, ch, ah, t, <space>, f, w, ey, f, er, <eos>] Name: visemes, dtype: object First 5 entries in 'tokenized_sentence' column: 0 <sos> when <space> you <space> are <space> cooking <space> chips <space> at <space> home <eos> 1 <sos> the <space> traditional <space> chip <space> pan <space> often <space> stays <space> on <space> the <space> shelf <eos> 2 <sos> through <space> what <space> they <space> call <space> a <space> knife <space> block <eos> 3 <sos> which <space> involves <space> firing <space> a <space> potato <space> down <space> a <space> pipe <eos> 4 <sos> apart <space> from <space> the <space> golden <space> colour <space> and <space> the <space> delicious <space> flavour <eos> Name: sentence_with_tokens, dtype: object Is '<sos>' included in 'visemes' column? True Is '<sos>' included in 'tokenized_sentence' column? True Is '<space>' included in 'visemes' column? True Is '<space>' included in 'tokenized_sentence' column? True Is '<eos>' included in 'visemes' column? True Is '<eos>' included in 'tokenized_sentence' column? True
In [ ]:
# Prepare the viseme and sentence data
viseme_tokenizer = Tokenizer(filters='', lower=False, split=' ')
viseme_tokenizer.fit_on_texts(df['visemes'])
viseme_sequences = viseme_tokenizer.texts_to_sequences(df['visemes'])
viseme_MAX_LEN = max(len(seq) for seq in viseme_sequences)
X_data = pad_sequences(viseme_sequences, maxlen=viseme_MAX_LEN, padding='post')
sentence_tokenizer = Tokenizer(filters='!"#$%&()*+,-./:;=?@[\\]^_`{|}~\t\n')
sentence_tokenizer.fit_on_texts(df['sentence_with_tokens'])
sentence_sequences = sentence_tokenizer.texts_to_sequences(df['sentence_with_tokens'])
sentence_MAX_LEN = max(len(seq) for seq in sentence_sequences)
y_data = pad_sequences(sentence_sequences, maxlen=sentence_MAX_LEN, padding='post')
In [ ]:
# Calculate lengths for each sequence in viseme and sentence sequences
viseme_lengths = [len(seq) for seq in viseme_sequences]
sentence_lengths = [len(seq) for seq in sentence_sequences]
# Descriptive Statistics
print("=== Viseme Sequences ===")
print(f"Average Length: {np.mean(viseme_lengths)}")
print(f"Minimum Length: {np.min(viseme_lengths)}")
print(f"Maximum Length: {np.max(viseme_lengths)}")
print("\n")
print("=== Sentence Sequences ===")
print(f"Average Length: {np.mean(sentence_lengths)}")
print(f"Minimum Length: {np.min(sentence_lengths)}")
print(f"Maximum Length: {np.max(sentence_lengths)}")
print("\n")
# Token Frequency
viseme_freq = pd.Series([item for sublist in viseme_sequences for item in sublist]).value_counts()
sentence_freq = pd.Series([item for sublist in sentence_sequences for item in sublist]).value_counts()
print("=== Most Frequent Visemes ===")
print(viseme_freq.head(10))
print("\n")
print("=== Most Frequent Words ===")
print(sentence_freq.head(10))
print("\n")
# Special Tokens
for token in special_tokens:
viseme_token_count = sum([seq.count(viseme_tokenizer.word_index[token]) for seq in viseme_sequences])
sentence_token_count = sum([seq.count(sentence_tokenizer.word_index[token]) for seq in sentence_sequences])
print(f"Occurrences of '{token}' in viseme sequences: {viseme_token_count}")
print(f"Occurrences of '{token}' in sentence sequences: {sentence_token_count}")
print("\n")
=== Viseme Sequences ===
Average Length: 34.041969365426695
Minimum Length: 11
Maximum Length: 109
=== Sentence Sequences ===
Average Length: 15.970196936542669
Minimum Length: 7
Maximum Length: 53
=== Most Frequent Visemes ===
1 297858
2 265845
...
9 60570
10 45700
Length: 10, dtype: int64
=== Most Frequent Words ===
1 296369
2 45700
...
9 7996
10 7692
Length: 10, dtype: int64
Occurrences of '<sos>' in viseme sequences: 45700
Occurrences of '<sos>' in sentence sequences: 45700
Occurrences of '<space>' in viseme sequences: 297858
Occurrences of '<space>' in sentence sequences: 296369
Occurrences of '<eos>' in viseme sequences: 45700
Occurrences of '<eos>' in sentence sequences: 45700
In [ ]:
# Split data into train and test sets
X_train, X_test, y_train, y_test, train_indices, test_indices = train_test_split(
X_data, y_data, range(len(X_data)), test_size=0.2, random_state=42
)
# Create TensorFlow Dataset objects
batch_size = 64
train_dataset = tf.data.Dataset.from_tensor_slices(((X_train, y_train), y_train)).batch(batch_size).prefetch(buffer_size=tf.data.AUTOTUNE)
test_dataset = tf.data.Dataset.from_tensor_slices(((X_test, y_test), y_test)).batch(batch_size).prefetch(buffer_size=tf.data.AUTOTUNE)
# Define the model
embedding_dim = 128
units = 256
# Encoder
encoder_inputs = Input(shape=(viseme_MAX_LEN,))
encoder_embedding_layer = Embedding(input_dim=len(viseme_tokenizer.word_index) + 1, output_dim=embedding_dim)
encoder_embedding = encoder_embedding_layer(encoder_inputs)
encoder_gru = GRU(units, return_sequences=True, return_state=True)
encoder_outputs, encoder_state = encoder_gru(encoder_embedding)
# Decoder
decoder_inputs = Input(shape=(sentence_MAX_LEN,))
decoder_embedding_layer = Embedding(input_dim=len(sentence_tokenizer.word_index) + 1, output_dim=embedding_dim)
decoder_embedding = decoder_embedding_layer(decoder_inputs)
decoder_gru = GRU(units, return_sequences=True)
decoder_outputs = decoder_gru(decoder_embedding, initial_state=encoder_state)
# Attention
attention = Attention()
context_vector = attention([decoder_outputs, encoder_outputs])
# Concatenate context vector and decoder output
decoder_combined = tf.concat([context_vector, decoder_outputs], axis=-1)
# Dense layer
decoder_dense = Dense(len(sentence_tokenizer.word_index) + 1, activation='softmax')
decoder_outputs = decoder_dense(decoder_combined)
# Model
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)
model.compile(optimizer=Adam(), loss='sparse_categorical_crossentropy', metrics=['accuracy'])
# Define early stopping callback
early_stopping_callback = EarlyStopping(monitor='val_loss', patience=5)
# Train the model
history = model.fit(
[X_train, y_train],
y_train,
batch_size=batch_size,
epochs=5,
validation_data=([X_test, y_test], y_test),
callbacks=[early_stopping_callback]
)
# Evaluate the model on the test set
test_loss, test_acc = model.evaluate([X_test, y_test], y_test)
print("Test Loss:", test_loss)
print("Test Accuracy:", test_acc)
Epoch 1/5 572/572 [==============================] - 376s 651ms/step - loss: 1.1226 - accuracy: 0.8587 - val_loss: 0.5126 - val_accuracy: 0.9333 Epoch 2/5 572/572 [==============================] - 370s 647ms/step - loss: 0.3519 - accuracy: 0.9546 - val_loss: 0.2554 - val_accuracy: 0.9695 Epoch 3/5 572/572 [==============================] - 365s 638ms/step - loss: 0.1824 - accuracy: 0.9781 - val_loss: 0.1546 - val_accuracy: 0.9835 Epoch 4/5 572/572 [==============================] - 365s 637ms/step - loss: 0.1049 - accuracy: 0.9875 - val_loss: 0.1117 - val_accuracy: 0.9891 Epoch 5/5 572/572 [==============================] - 363s 635ms/step - loss: 0.0640 - accuracy: 0.9920 - val_loss: 0.0937 - val_accuracy: 0.9916 286/286 [==============================] - 49s 172ms/step - loss: 0.0937 - accuracy: 0.9916 Test Loss: 0.09373828768730164 Test Accuracy: 0.9915651679039001
In [ ]:
# 1. Advanced Training and Validation Loss Curve
plt.figure(figsize=(12, 6))
plt.plot(history.history['loss'], label='Training Loss', color='blue', linestyle='--')
plt.plot(history.history['val_loss'], label='Validation Loss', color='red')
plt.scatter(np.argmin(history.history['val_loss']), min(history.history['val_loss']), s=100, c='red', marker='o')
plt.title('Advanced Training and Validation Loss over Epochs')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend()
plt.grid(True, which='both', linestyle='--', linewidth=0.5)
plt.tight_layout()
plt.show()
# 2. Advanced Training and Validation Accuracy Curve
plt.figure(figsize=(12, 6))
plt.plot(history.history['accuracy'], label='Training Accuracy', color='blue', linestyle='--')
plt.plot(history.history['val_accuracy'], label='Validation Accuracy', color='purple')
plt.scatter(np.argmax(history.history['val_accuracy']), max(history.history['val_accuracy']), s=100, c='purple', marker='o')
plt.title('Advanced Training and Validation Accuracy over Epochs')
plt.xlabel('Epochs')
plt.ylabel('Accuracy')
plt.legend()
plt.grid(True, which='both', linestyle='--', linewidth=0.5)
plt.tight_layout()
plt.show()
# 3. Model Architecture Visualization
plot_model(model, to_file='advanced_model_plot.png', show_shapes=True, show_layer_names=True, expand_nested=True)
plt.figure(figsize=(20, 20))
img = plt.imread('advanced_model_plot.png')
plt.imshow(img)
plt.axis('off')
plt.title('Advanced Model Architecture Visualization')
plt.show()
# 4. Advanced Final Test Loss and Accuracy
plt.figure(figsize=(10, 6))
bar_width = 0.35
index = np.arange(2)
bars1 = [test_loss, history.history['val_loss'][-1]]
bars2 = [test_acc, history.history['val_accuracy'][-1]]
rects1 = plt.bar(index, bars1, bar_width, label='Test', color='blue', alpha=0.8)
rects2 = plt.bar(index + bar_width, bars2, bar_width, label='Validation (Final Epoch)', color='green', alpha=0.8)
plt.xlabel('Metrics')
plt.ylabel('Values')
plt.title('Test Loss and Accuracy vs. Validation Metrics')
plt.xticks(index + bar_width / 2, ('Loss', 'Accuracy'))
plt.legend()
plt.tight_layout()
plt.show()
In [ ]:
print(test_indices[:10])
print([df['visemes'].iloc[idx] for idx in test_indices[:10]])
[29468, 44953, 23062, 43578, 2182, 26391, 21035, 35684, 39308, 1911] [['<sos>', 'w', 'iy', '<space>', 'k', 'iy', 't', '<space>', 'w', 'aa', 'w', 't', '<space>', 'f', 'ey', 'k', 't', 't', '<eos>'], ['<sos>', 't', 'ey', 't', '<space>', 'p', 'iy', 'k', 't', '<space>', 'f', 'ao', 'w', '<space>', 't', 'ah', '<space>', 'f', 'er', 't', 't', '<space>', 't', 'aa', 'p', '<space>', 't', 'iy', 'k', 't', '<space>', 'w', 'iy', '<space>', 'ch', 'ao', 'k', 't', '<space>', 't', 'ah', '<space>', 'w', 'uh', '<eos>'], ['<sos>', 'aa', '<space>', 't', 'iy', 'k', 'k', '<space>', 't', 'iy', 't', '<space>', 'iy', 't', '<space>', 'ah', '<space>', 'w', 'iy', 'w', 'iy', '<space>', 'k', 'uh', 't', '<space>', 'w', 'iy', 't', 'ah', 'w', '<space>', 'f', 'w', 'ey', 't', '<eos>'], ['<sos>', 'p', 'ey', 't', 't', '<space>', 'aa', 'k', '<space>', 't', 'ah', '<space>', 'f', 'ey', 'k', 't', '<space>', 'aa', '<space>', 'w', 'ey', 'f', '<space>', 'w', 'er', 'k', 't', '<space>', 'w', 'iy', 't', '<space>', 'w', 'w', 'uh', '<space>', 'f', 'ao', 'w', '<space>', 'w', 'ao', 'k', 'k', 'er', '<eos>'], ['<sos>', 't', 'ah', '<space>', 'ah', 't', 'er', 't', '<space>', 'ao', 'k', 'w', 'iy', '<space>', 'k', 'ey', 't', '<space>', 'k', 'ey', 'p', 'ah', 'k', '<eos>'], ['<sos>', 'iy', 't', '<space>', 'iy', 't', '<space>', 'k', 'aa', 't', '<space>', 'ao', 'f', 'ah', 'k', '<space>', 'aa', '<space>', 'w', 'uh', 't', '<space>', 'p', 'aa', '<space>', 't', 'ey', 'p', 'p', 'er', '<eos>'], ['<sos>', 't', 'iy', 't', '<space>', 'w', 'aa', 't', '<space>', 't', 'ah', '<space>', 't', 'aa', 'p', '<space>', 'iy', 'k', '<space>', 'w', 'iy', 'ch', '<eos>'], ['<sos>', 't', 'ah', '<space>', 'ch', 'ao', '<space>', 'w', 'ey', 'w', '<space>', 'ah', '<space>', 't', 'iy', 'p', '<space>', 'ah', 'f', '<space>', 'f', 'aa', 'f', '<space>', 'k', 'w', 'iy', 't', '<space>', 'ch', 'ey', 'w', 'ah', 'k', 'ch', 'er', 't', '<space>', 'p', 'iy', 't', '<space>', 't', 'ey', 'w', '<space>', 'w', 'iy', 't', 't', '<space>', 'ah', 'k', 'ey', 'k', 't', 't', '<space>', 'p', 'aa', 't', 'ah', 'p', 'w', 'iy', '<space>', 't', 'ah', '<space>', 'k', 'w', 'ey', 't', 'ah', 't', 't', '<space>', 'k', 'w', 'iy', 't', '<eos>'], ['<sos>', 'ah', 'k', 't', '<space>', 't', 'ao', '<space>', 'w', 'iy', '<space>', 'w', 'ey', 'f', '<space>', 't', 'w', 'aa', 't', '<space>', 't', 'uh', '<space>', 'p', 'iy', '<space>', 'k', 'uh', 't', '<space>', 'p', 'ey', 'k', 'er', 't', '<space>', 'p', 'aa', '<space>', 't', 'ey', 'iy', 'k', '<space>', 't', 'ey', 'k', '<eos>'], ['<sos>', 'w', 'aa', 'w', '<space>', 'ey', 'k', 'iy', '<space>', 'ey', 't', '<space>', 't', 't', 'iy', 'w', '<space>', 'ah', '<space>', 'p', 'ey', 'p', '<eos>']]
In [ ]:
# Function to convert predicted token IDs to text
def sequences_to_texts(sequences, tokenizer):
texts = tokenizer.sequences_to_texts(sequences)
return texts
# Select a subset from the test data for evaluation
num_examples = 15
X_test_subset = X_test[:num_examples]
y_test_subset = y_test[:num_examples]
original_visemes = df['visemes'].tolist()
# Get the visemes for the selected test subset
test_visemes_subset = [original_visemes[idx] for idx in test_indices[:num_examples]]
# Generate predictions on the subset of the test set
predictions = model.predict([X_test_subset, y_test_subset])
# Convert predicted token IDs to text
predicted_sentences = sequences_to_texts(predictions.argmax(axis=-1), sentence_tokenizer)
# Convert original token IDs to text
original_sentences = sequences_to_texts(y_test_subset, sentence_tokenizer)
# Initialize WER, BLEU, and CER scores
wer_scores = []
bleu_scores = []
# Print the original sentences, predicted sentences, and visemes side by side
for original, predicted, viseme_seq in zip(original_sentences, predicted_sentences, test_visemes_subset):
viseme_seq_text = ' '.join(viseme_seq)
print(f"Original: {original}\nPredicted: {predicted}\nVisemes: {viseme_seq_text}")
# Calculate WER
wer = jiwer.wer(original, predicted)
wer_scores.append(wer)
print(f"WER: {wer:.4f}")
# Tokenize sentences for BLEU score calculation
original_tokens = original.split()
predicted_tokens = predicted.split()
# Calculate BLEU score
bleu_score = corpus_bleu([[original_tokens]], [predicted_tokens], smoothing_function=SmoothingFunction().method3)
bleu_scores.append(bleu_score)
print(f"BLEU Score: {bleu_score:.4f}")
print("-" * 50)
# Calculate average WER and BLEU scores
average_wer = sum(wer_scores) / len(wer_scores)
average_bleu = sum(bleu_scores) / len(bleu_scores)
print(f"Average WER: {average_wer:.4f}")
print(f"Average BLEU Score: {average_bleu:.4f}")
1/1 [==============================] - 1s 812ms/step Original: <sos> we <space> need <space> hard <space> facts <eos> Predicted: <sos> we <space> need <space> hard <space> facts <eos> Visemes: <sos> w iy <space> k iy t <space> w aa w t <space> f ey k t t <eos> WER: 0.0000 BLEU Score: 1.0000 -------------------------------------------------- Original: <sos> that <space> means <space> for <space> the <space> first <space> time <space> since <space> we <space> joined <space> the <space> eu <eos> Predicted: <sos> that <space> means <space> for <space> the <space> first <space> time <space> since <space> we <space> joined <space> the <space> eu <eos> Visemes: <sos> t ey t <space> p iy k t <space> f ao w <space> t ah <space> f er t t <space> t aa p <space> t iy k t <space> w iy <space> ch ao k t <space> t ah <space> w uh <eos> WER: 0.0000 BLEU Score: 1.0000 -------------------------------------------------- Original: <sos> i <space> think <space> this <space> is <space> a <space> really <space> good <space> little <space> flat <eos> Predicted: <sos> i <space> think <space> this <space> is <space> a <space> really <space> good <space> little <space> flat <eos> Visemes: <sos> aa <space> t iy k k <space> t iy t <space> iy t <space> ah <space> w iy w iy <space> k uh t <space> w iy t ah w <space> f w ey t <eos> WER: 0.0000 BLEU Score: 1.0000 -------------------------------------------------- Original: <sos> based <space> on <space> the <space> fact <space> i <space> have <space> worked <space> with <space> hugh <space> for <space> longer <eos> Predicted: <sos> based <space> on <space> the <space> fact <space> i <space> have <space> worked <space> with <space> launch <space> for <space> longer <eos> Visemes: <sos> p ey t t <space> aa k <space> t ah <space> f ey k t <space> aa <space> w ey f <space> w er k t <space> w iy t <space> w w uh <space> f ao w <space> w ao k k er <eos> WER: 0.0435 BLEU Score: 0.8787 -------------------------------------------------- Original: <sos> the <space> others <space> only <space> get <space> gammon <eos> Predicted: <sos> the <space> others <space> only <space> get <space> tim <eos> Visemes: <sos> t ah <space> ah t er t <space> ao k w iy <space> k ey t <space> k ey p ah k <eos> WER: 0.0909 BLEU Score: 0.8071 -------------------------------------------------- Original: <sos> it <space> is <space> not <space> often <space> i <space> lose <space> my <space> temper <eos> Predicted: <sos> it <space> is <space> not <space> often <space> i <space> lose <space> my <space> nickname <eos> Visemes: <sos> iy t <space> iy t <space> k aa t <space> ao f ah k <space> aa <space> w uh t <space> p aa <space> t ey p p er <eos> WER: 0.0588 BLEU Score: 0.8844 -------------------------------------------------- Original: <sos> this <space> was <space> the <space> time <space> in <space> which <eos> Predicted: <sos> this <space> was <space> the <space> time <space> in <space> which <eos> Visemes: <sos> t iy t <space> w aa t <space> t ah <space> t aa p <space> iy k <space> w iy ch <eos> WER: 0.0000 BLEU Score: 1.0000 -------------------------------------------------- Original: <sos> the <space> show <space> where <space> a <space> team <space> of <space> five <space> quiz <space> challengers <space> pit <space> their <space> wits <space> against <space> possibly <space> the <space> greatest <space> quiz <eos> Predicted: <sos> the <space> show <space> where <space> a <space> team <space> of <space> five <space> quiz <space> challengers <space> pit <space> their <space> wits <space> against <space> possibly <space> the <space> greatest <space> quiz <eos> Visemes: <sos> t ah <space> ch ao <space> w ey w <space> ah <space> t iy p <space> ah f <space> f aa f <space> k w iy t <space> ch ey w ah k ch er t <space> p iy t <space> t ey w <space> w iy t t <space> ah k ey k t t <space> p aa t ah p w iy <space> t ah <space> k w ey t ah t t <space> k w iy t <eos> WER: 0.0000 BLEU Score: 1.0000 -------------------------------------------------- Original: <sos> and <space> so <space> we <space> have <space> tried <space> to <space> be <space> good <space> mannered <space> by <space> saying <space> ten <eos> Predicted: <sos> and <space> so <space> we <space> have <space> tried <space> to <space> be <space> good <space> topical <space> by <space> saying <space> ten <eos> Visemes: <sos> ah k t <space> t ao <space> w iy <space> w ey f <space> t w aa t <space> t uh <space> p iy <space> k uh t <space> p ey k er t <space> p aa <space> t ey iy k <space> t ey k <eos> WER: 0.0400 BLEU Score: 0.8895 -------------------------------------------------- Original: <sos> while <space> annie's <space> still <space> a <space> babe <eos> Predicted: <sos> while <space> tim <space> still <space> a <space> peake <eos> Visemes: <sos> w aa w <space> ey k iy <space> ey t <space> t t iy w <space> ah <space> p ey p <eos> WER: 0.1818 BLEU Score: 0.4833 -------------------------------------------------- Original: <sos> which <space> was <space> the <space> area <space> that <space> they <space> wanted <space> us <space> to <space> take <space> the <space> casualty <eos> Predicted: <sos> which <space> was <space> the <space> area <space> that <space> they <space> wanted <space> us <space> to <space> take <space> the <space> depth <eos> Visemes: <sos> w iy ch <space> w aa t <space> t ah <space> ey w iy ah <space> t ey t <space> t ey <space> w aa k t ah t <space> ah t <space> t uh <space> t ey k <space> t ah <space> k ey ch ah w ah w t iy <eos> WER: 0.0400 BLEU Score: 0.9245 -------------------------------------------------- Original: <sos> who <space> do <space> i <space> see <space> about <space> a <space> death <space> certificate <eos> Predicted: <sos> who <space> do <space> i <space> see <space> about <space> a <space> death <space> certificate <eos> Visemes: <sos> w uh <space> t uh <space> aa <space> t iy <space> ah p aa t <space> ah <space> t ey t <space> t er t iy f iy k ah t <eos> WER: 0.0000 BLEU Score: 1.0000 -------------------------------------------------- Original: <sos> reported <space> back <space> to <space> his <space> cabinet <eos> Predicted: <sos> reported <space> back <space> to <space> his <space> cabinet <eos> Visemes: <sos> w iy p ao w t ah t <space> p ey k <space> t uh <space> w iy t <space> k ey p ah k ah t <eos> WER: 0.0000 BLEU Score: 1.0000 -------------------------------------------------- Original: <sos> you <space> want <space> to <space> say <space> to <space> people <space> round <space> the <space> dining <space> table <eos> Predicted: <sos> you <space> want <space> to <space> say <space> to <space> people <space> round <space> the <space> dining <space> table <eos> Visemes: <sos> w uh <space> w aa k t <space> t uh <space> t ey <space> t uh <space> p iy p ah w <space> w aa k t <space> t ah <space> t aa k iy k <space> t ey p ah w <eos> WER: 0.0000 BLEU Score: 1.0000 -------------------------------------------------- Original: <sos> sold <space> in <space> aid <space> of <space> children <space> in <space> need <eos> Predicted: <sos> sold <space> in <space> aid <space> of <space> children <space> in <space> need <eos> Visemes: <sos> t ao w t <space> iy k <space> ey t <space> ah f <space> ch iy w t w ah k <space> iy k <space> k iy t <eos> WER: 0.0000 BLEU Score: 1.0000 -------------------------------------------------- Average WER: 0.0303 Average BLEU Score: 0.9245
In [ ]:
import matplotlib.pyplot as plt
# Create a list of set labels
sets = [f"Set {i+1}" for i in range(num_examples)]
# Create a figure and a set of subplots
fig, ax = plt.subplots(2, 1, figsize=(12, 10))
# Plot WER scores with markers
ax[0].plot(sets, wer_scores, color='steelblue', marker='o', label='WER per Set')
ax[0].axhline(average_wer, color='coral', linestyle='dashed', linewidth=1, label=f'Average WER: {average_wer:.4f}')
ax[0].set_title('Word Error Rate (WER) for Each Set')
ax[0].set_ylabel('WER')
ax[0].set_xticks(sets)
ax[0].set_xticklabels(sets, rotation=45)
ax[0].legend()
# Plot BLEU scores with markers
ax[1].plot(sets, bleu_scores, color='steelblue', marker='o', label='BLEU Score per Set')
ax[1].axhline(average_bleu, color='coral', linestyle='dashed', linewidth=1, label=f'Average BLEU: {average_bleu:.4f}')
ax[1].set_title('BLEU Score for Each Set')
ax[1].set_ylabel('BLEU Score')
ax[1].set_xticks(sets)
ax[1].set_xticklabels(sets, rotation=45)
ax[1].legend()
# Adjust the layout
plt.tight_layout()
plt.show()
In [ ]: