1. Installing Libraries

In [ ]:
!pip install pyenchant contractions g2p-en cmudict jiwer
!pip install -U nltk

import nltk
nltk.download('averaged_perceptron_tagger')
nltk.download('punkt')
nltk.download('cmudict')
# Note: the `enchant` module imported below is provided by the pyenchant package installed above.
Requirement already satisfied: pyenchant in /usr/local/lib/python3.10/dist-packages (3.2.2)
Requirement already satisfied: contractions in /usr/local/lib/python3.10/dist-packages (0.1.73)
Requirement already satisfied: g2p-en in /usr/local/lib/python3.10/dist-packages (2.1.0)
Requirement already satisfied: cmudict in /usr/local/lib/python3.10/dist-packages (1.0.13)
Requirement already satisfied: jiwer in /usr/local/lib/python3.10/dist-packages (3.0.3)
Requirement already satisfied: textsearch>=0.0.21 in /usr/local/lib/python3.10/dist-packages (from contractions) (0.0.24)
Requirement already satisfied: numpy>=1.13.1 in /usr/local/lib/python3.10/dist-packages (from g2p-en) (1.23.5)
Requirement already satisfied: nltk>=3.2.4 in /usr/local/lib/python3.10/dist-packages (from g2p-en) (3.8.1)
Requirement already satisfied: inflect>=0.3.1 in /usr/local/lib/python3.10/dist-packages (from g2p-en) (7.0.0)
Requirement already satisfied: distance>=0.1.3 in /usr/local/lib/python3.10/dist-packages (from g2p-en) (0.1.3)
Requirement already satisfied: importlib-metadata<6.0.0,>=5.1.0 in /usr/local/lib/python3.10/dist-packages (from cmudict) (5.2.0)
Requirement already satisfied: importlib-resources<6.0.0,>=5.10.1 in /usr/local/lib/python3.10/dist-packages (from cmudict) (5.13.0)
Requirement already satisfied: click<9.0.0,>=8.1.3 in /usr/local/lib/python3.10/dist-packages (from jiwer) (8.1.7)
Requirement already satisfied: rapidfuzz<4,>=3 in /usr/local/lib/python3.10/dist-packages (from jiwer) (3.3.0)
Requirement already satisfied: zipp>=0.5 in /usr/local/lib/python3.10/dist-packages (from importlib-metadata<6.0.0,>=5.1.0->cmudict) (3.16.2)
Requirement already satisfied: pydantic>=1.9.1 in /usr/local/lib/python3.10/dist-packages (from inflect>=0.3.1->g2p-en) (1.10.12)
Requirement already satisfied: typing-extensions in /usr/local/lib/python3.10/dist-packages (from inflect>=0.3.1->g2p-en) (4.5.0)
Requirement already satisfied: joblib in /usr/local/lib/python3.10/dist-packages (from nltk>=3.2.4->g2p-en) (1.3.2)
Requirement already satisfied: regex>=2021.8.3 in /usr/local/lib/python3.10/dist-packages (from nltk>=3.2.4->g2p-en) (2023.6.3)
Requirement already satisfied: tqdm in /usr/local/lib/python3.10/dist-packages (from nltk>=3.2.4->g2p-en) (4.66.1)
Requirement already satisfied: anyascii in /usr/local/lib/python3.10/dist-packages (from textsearch>=0.0.21->contractions) (0.3.2)
Requirement already satisfied: pyahocorasick in /usr/local/lib/python3.10/dist-packages (from textsearch>=0.0.21->contractions) (2.0.0)
Requirement already satisfied: nltk in /usr/local/lib/python3.10/dist-packages (3.8.1)
Requirement already satisfied: click in /usr/local/lib/python3.10/dist-packages (from nltk) (8.1.7)
Requirement already satisfied: joblib in /usr/local/lib/python3.10/dist-packages (from nltk) (1.3.2)
Requirement already satisfied: regex>=2021.8.3 in /usr/local/lib/python3.10/dist-packages (from nltk) (2023.6.3)
Requirement already satisfied: tqdm in /usr/local/lib/python3.10/dist-packages (from nltk) (4.66.1)
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package cmudict to /root/nltk_data...
[nltk_data]   Package cmudict is already up-to-date!
In [ ]:
from google.colab import drive
drive.mount('/content/drive')
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
In [ ]:
# Standard library
from pathlib import Path
import re
from re import match
import random
import string
from collections import Counter, defaultdict
from itertools import chain
from multiprocessing import Pool

# Data handling and plotting
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud
from IPython.display import display

# Text, phonetics and evaluation
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import cmudict
from nltk.translate.bleu_score import corpus_bleu, SmoothingFunction
import enchant
from g2p_en import G2p
import inflect
import contractions
import jiwer

# Modelling
import tensorflow as tf
from tensorflow.keras.layers import Input, Embedding, GRU, Dense, Dropout, Attention
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.utils import plot_model
from sklearn.model_selection import train_test_split
In [ ]:
# Define the file path
path = Path("/content/drive/MyDrive/Dissertation/g_train.txt")

# Read the file content
with open(path, 'r', encoding='utf-8') as file:
    lines = file.readlines()

# Number of lines in the dataset
num_lines = len(lines)
print(f"Number of lines in the dataset are: {num_lines}")

# Display first few lines
print("\nThe first 5 lines are:")
for line in lines[:5]:
    print(line.strip())

# Total number of words in the dataset
total_words = sum(len(line.split()) for line in lines)
print(f"\nThe Total number of words are: {total_words}")

# Average number of words per line
avg_words_per_line = total_words / num_lines
print(f"The average number of words per line are: {avg_words_per_line:.2f}")
Number of lines in the dataset: 45839

The first 5 lines are:
5535415699068794046/00001, WHEN YOU'RE COOKING CHIPS AT HOME
5535415699068794046/00002, THE TRADITIONAL CHIP PAN OFTEN STAYS ON THE SHELF
5535415699068794046/00003, THROUGH WHAT THEY CALL A KNIFE BLOCK
5535415699068794046/00004, WHICH INVOLVES FIRING A POTATO DOWN A PIPE
5535415699068794046/00006, APART FROM THE GOLDEN COLOUR AND THE DELICIOUS FLAVOUR

Total number of words: 375019
Average number of words per line: 8.18
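Each line of g_train.txt pairs an utterance identifier of the form <video id>/<segment index>, with an upper-case transcript. The preprocessing cell below strips this prefix with the regular expression ^\d+/\d+,\s* before any further cleaning; a minimal sketch of that step, reusing the first line shown above:
In [ ]:
import re

sample = "5535415699068794046/00001, WHEN YOU'RE COOKING CHIPS AT HOME"
# Remove the "<video id>/<segment index>, " prefix, keeping only the transcript
transcript = re.sub(r'^\d+/\d+,\s*', '', sample).strip()
print(transcript)  # WHEN YOU'RE COOKING CHIPS AT HOME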
In [ ]:
# Create inflect engine
p = inflect.engine()

# Function to convert numbers to words
def convert_numbers_to_words(text):
    words_with_numbers = []
    words = text.split()
    for i, word in enumerate(words):
        if word.isdigit():
            words_with_numbers.append((word, p.number_to_words(word)))
            words[i] = words_with_numbers[-1][1]
    return ' '.join(words), words_with_numbers

# Function to replace hyphens with spaces
def replace_hyphens_with_spaces(text):
    return text.replace('-', ' ')

lines = []
numbers_converted = []
lines_with_numbers = 0  # number of sentences that contained at least one numeric token
with open(path, encoding='utf-8') as file:
    for line in file.readlines():
        cleaned_line, numbers = convert_numbers_to_words(re.sub(r'^\d+/\d+,\s*', '', line).strip())
        lines.append(cleaned_line)
        numbers_converted.extend(numbers)
        if numbers:
            lines_with_numbers += 1

# Convert to DataFrame and process
df = pd.DataFrame(lines, columns=['sentence'])
df['sentence'] = df['sentence'].apply(str.lstrip)
df['sentence'] = df['sentence'].apply(replace_hyphens_with_spaces)

# Print converted numbers
print("\nNumbers converted to words:")
for number, words in numbers_converted:
    print(f"{number}: {words.replace('-', ' ')}")

# Additional insights
print("\nSome additional insights:")
print("Average sentence length:", df['sentence'].str.split().apply(len).mean())
print("Max sentence length:", df['sentence'].str.split().apply(len).max())
print("Min sentence length:", df['sentence'].str.split().apply(len).min())
print("Unique words:", len(set(' '.join(df['sentence']).split())))

print("\nShape:", df.shape)

# Print DataFrame
print(df.head())
Numbers converted to words:
4: four
24: twenty four
1972: one thousand, nine hundred and seventy two
1: one
20: twenty
2: two
2012: two thousand and twelve
110: one hundred and ten
6: six
14: fourteen
31: thirty one
1964: one thousand, nine hundred and sixty four
1965: one thousand, nine hundred and sixty five
69: sixty nine
1: one
1966: one thousand, nine hundred and sixty six
67: sixty seven
1: one
10: ten
230: two hundred and thirty
1968: one thousand, nine hundred and sixty eight
1969: one thousand, nine hundred and sixty nine
80: eighty
17: seventeen
50: fifty
60: sixty
200: two hundred
180: one hundred and eighty
10: ten
20: twenty
195: one hundred and ninety five
249: two hundred and forty nine
300: three hundred
500: five hundred
350: three hundred and fifty
25: twenty five
20: twenty
65: sixty five
10: ten
2005: two thousand and five
000: zero
50: fifty
1: one
40: forty
60: sixty
23: twenty three
5: five
10: ten
10: ten
000: zero
180: one hundred and eighty
54: fifty four
1: one
300: three hundred
45: forty five
50: fifty
5: five
20: twenty
80: eighty
10: ten
100: one hundred
3: three
10: ten
21: twenty one
21: twenty one
400: four hundred
5: five
40: forty
35: thirty five
5: five
1: one
15: fifteen
250: two hundred and fifty
15: fifteen
25: twenty five
27: twenty seven
60: sixty
4: four
500: five hundred
6: six
6: six
20: twenty
68: sixty eight
7: seven
24: twenty four
2: two
3: three
2000: two thousand
40: forty
39: thirty nine
1940: one thousand, nine hundred and forty
75: seventy five
000: zero
100: one hundred
1: one
800: eight hundred
108: one hundred and eight
012: twelve
2012: two thousand and twelve
20: twenty
10: ten
85: eighty five
31: thirty one
300: three hundred
9: nine
3: three
150: one hundred and fifty
20: twenty
500: five hundred
000: zero
100: one hundred
1734: one thousand, seven hundred and thirty four
147: one hundred and forty seven
85: eighty five
55: fifty five
14: fourteen
15: fifteen
80: eighty
000: zero
90: ninety
200: two hundred
25: twenty five
300: three hundred
75: seventy five
10: ten
5: five
22: twenty two
150: one hundred and fifty
45: forty five
2001: two thousand and one
10: ten
6: six
300: three hundred
12: twelve
185: one hundred and eighty five
80: eighty
14: fourteen
30: thirty
40: forty
29: twenty nine
14: fourteen
300: three hundred
1: one
810: eight hundred and ten
18: eighteen
18: eighteen
40: forty
60: sixty
35: thirty five
000: zero
000: zero
9: nine
15: fifteen
17: seventeen
12: twelve
25: twenty five
30: thirty
21: twenty one
300: three hundred
10: ten
2009: two thousand and nine
45: forty five
30: thirty
59: fifty nine
7: seven
24: twenty four
5: five
000: zero
30: thirty
86: eighty six
32: thirty two
10: ten
5: five
70: seventy
100: one hundred
50: fifty
28: twenty eight
65: sixty five
45: forty five
30: thirty
40: forty
000: zero
1642: one thousand, six hundred and forty two
65: sixty five
12: twelve
1527: one thousand, five hundred and twenty seven
150: one hundred and fifty
250: two hundred and fifty
60: sixty
1900: one thousand, nine hundred
140: one hundred and forty
10: ten
12: twelve
13: thirteen
35: thirty five
12: twelve
40: forty
50: fifty
50: fifty
50: fifty
9: nine
5: five
1: one
125: one hundred and twenty five
5: five
28: twenty eight
24: twenty four
000: zero
18: eighteen
78: seventy eight
100: one hundred
10: ten
100: one hundred
18: eighteen
25: twenty five
1979: one thousand, nine hundred and seventy nine
33: thirty three
100: one hundred
100: one hundred
50: fifty
80: eighty
20: twenty
30: thirty
1: one
1709: one thousand, seven hundred and nine
1710: one thousand, seven hundred and ten
30: thirty
50: fifty
400: four hundred
218: two hundred and eighteen
97: ninety seven
24: twenty four
28: twenty eight
4: four
520: five hundred and twenty
32: thirty two
2013: two thousand and thirteen
2013: two thousand and thirteen
10: ten
160: one hundred and sixty
250: two hundred and fifty
2012: two thousand and twelve
78: seventy eight
2010: two thousand and ten
1980: one thousand, nine hundred and eighty
600: six hundred
500: five hundred
1: one
000: zero
30: thirty
400: four hundred
10: ten
200: two hundred
27: twenty seven
5: five
14: fourteen
90: ninety
25: twenty five
20: twenty
41: forty one
25: twenty five
100: one hundred
150: one hundred and fifty
20: twenty
30: thirty
25: twenty five
60: sixty
41: forty one
40: forty
47: forty seven
20: twenty
50: fifty
60: sixty
50: fifty
400: four hundred
1: one
20: twenty
140: one hundred and forty
1: one
80: eighty
20: twenty
95: ninety five
18: eighteen
500: five hundred
40: forty
50: fifty
500: five hundred
1: one
2015: two thousand and fifteen
100: one hundred
2: two
190: one hundred and ninety
400: four hundred
5: five
10: ten
50: fifty
5: five
36: thirty six
000: zero
67: sixty seven
100: one hundred
300: three hundred
125: one hundred and twenty five
30: thirty
11: eleven
16: sixteen
24: twenty four
2: two
4: four
3: three
30: thirty
2010: two thousand and ten
2010: two thousand and ten
2011: two thousand and eleven
1: one
20: twenty
40: forty
40: forty
8: eight
30: thirty
1947: one thousand, nine hundred and forty seven
60: sixty
5: five
60: sixty
12: twelve
12: twelve
1: one
12: twelve
1: one
12: twelve
2: two
2: two
12: twelve
12: twelve
12: twelve
2: two
100: one hundred
50: fifty
100: one hundred
1: one
12: twelve
000: zero
12: twelve
6: six
12: twelve
2000: two thousand
12: twelve
999: nine hundred and ninety nine
14: fourteen
12: twelve
000: zero
4: four
12: twelve
1: one
12: twelve
12: twelve
14: fourteen
12: twelve
000: zero
500: five hundred
500: five hundred
000: zero
10: ten
12: twelve
1: one
12: twelve
12: twelve
300: three hundred
500: five hundred
7: seven
000: zero
12: twelve
12: twelve
2: two
12: twelve
2011: two thousand and eleven
000: zero
14: fourteen
000: zero
63: sixty three
64: sixty four
180: one hundred and eighty
30: thirty
11: eleven
35: thirty five
8: eight
600: six hundred
95: ninety five
000: zero
450: four hundred and fifty
475: four hundred and seventy five
5: five
37: thirty seven
31: thirty one
35: thirty five
65: sixty five
000: zero
20: twenty
1835: one thousand, eight hundred and thirty five
54: fifty four
67: sixty seven
000: zero
000: zero
32: thirty two
000: zero
30: thirty
40: forty
58: fifty eight
000: zero
960: nine hundred and sixty
24: twenty four
27: twenty seven
000: zero
32: thirty two
89: eighty nine
70: seventy
48: forty eight
000: zero
62: sixty two
58: fifty eight
000: zero
12: twelve
69: sixty nine
160: one hundred and sixty
145: one hundred and forty five
500: five hundred
100: one hundred
93: ninety three
500: five hundred
30: thirty
50: fifty
1: one
13: thirteen
200: two hundred
000: zero
6: six
600: six hundred
63: sixty three
400: four hundred
60: sixty
1: one
1: one
000: zero
1: one
1: one
300: three hundred
25: twenty five
000: zero
24: twenty four
90: ninety
110: one hundred and ten
1935: one thousand, nine hundred and thirty five
30: thirty
50: fifty
800: eight hundred
100: one hundred
150: one hundred and fifty
250: two hundred and fifty
22: twenty two
200: two hundred
35: thirty five
2012: two thousand and twelve
1851: one thousand, eight hundred and fifty one
10: ten
24: twenty four
12: twelve
1943: one thousand, nine hundred and forty three
60: sixty
617: six hundred and seventeen
1533: one thousand, five hundred and thirty three
200: two hundred
1537: one thousand, five hundred and thirty seven
56: fifty six
26: twenty six
14: fourteen
2012: two thousand and twelve
20: twenty
30: thirty
21: twenty one
99: ninety nine
15: fifteen
12: twelve
900: nine hundred
1: one
1940: one thousand, nine hundred and forty
53: fifty three
400: four hundred
40: forty
1: one
2: two
52: fifty two
28: twenty eight
25: twenty five
48: forty eight
2008: two thousand and eight
60: sixty
75: seventy five
80: eighty
8: eight
5: five
100: one hundred
62: sixty two
12: twelve
11: eleven
371: three hundred and seventy one
371: three hundred and seventy one
000: zero
11: eleven
80: eighty
23: twenty three
40: forty
000: zero
6: six
2003: two thousand and three
93: ninety three
6: six
20: twenty
2003: two thousand and three
205: two hundred and five
69: sixty nine
33: thirty three
000: zero
59: fifty nine
56: fifty six
000: zero
40: forty
15: fifteen
70: seventy
50: fifty
60: sixty
29: twenty nine
45: forty five
80: eighty
100: one hundred
40: forty
8: eight
30: thirty
1: one
12: twelve
000: zero
55: fifty five
000: zero
10: ten
135: one hundred and thirty five
160: one hundred and sixty
110: one hundred and ten
90: ninety
15: fifteen
150: one hundred and fifty
16: sixteen
170: one hundred and seventy
20: twenty
54: fifty four
195: one hundred and ninety five
10: ten
450: four hundred and fifty
20: twenty
450: four hundred and fifty
400: four hundred
1: one
12: twelve
8: eight
200: two hundred
2: two
1: one
2010: two thousand and ten
25: twenty five
46: forty six
000: zero
62: sixty two
59: fifty nine
1995: one thousand, nine hundred and ninety five
000: zero
51: fifty one
56: fifty six
000: zero
000: zero
24: twenty four
66: sixty six
36: thirty six
57: fifty seven
000: zero
200: two hundred
1: one
76: seventy six
40: forty
20: twenty
16: sixteen
2012: two thousand and twelve
25: twenty five
50: fifty
14: fourteen
2013: two thousand and thirteen
61: sixty one
22: twenty two
11: eleven
66: sixty six
67: sixty seven
1846: one thousand, eight hundred and forty six
75: seventy five
120: one hundred and twenty
75: seventy five
5: five
5: five
11: eleven
3: three
50: fifty
2013: two thousand and thirteen
20: twenty
40: forty
3: three
35: thirty five
24: twenty four
12: twelve
10: ten
18: eighteen
18: eighteen
65: sixty five
100: one hundred
50: fifty
57: fifty seven
300: three hundred
50: fifty
8: eight
75: seventy five
30: thirty
50: fifty
28: twenty eight
10: ten
20: twenty
400: four hundred
500: five hundred
150: one hundred and fifty
7: seven
18: eighteen
26: twenty six
1984: one thousand, nine hundred and eighty four
462: four hundred and sixty two
2014: two thousand and fourteen
72: seventy two
1820: one thousand, eight hundred and twenty
2: two
1754: one thousand, seven hundred and fifty four
10: ten
10: ten
60: sixty
75: seventy five
94: ninety four
50: fifty
100: one hundred
370: three hundred and seventy
40: forty
50: fifty
51: fifty one
200: two hundred
1: one
70: seventy
100: one hundred
200: two hundred
1893: one thousand, eight hundred and ninety three
1991: one thousand, nine hundred and ninety one
100: one hundred
25: twenty five
100: one hundred
30: thirty
10: ten
1: one
200: two hundred
300: three hundred
4: four
120: one hundred and twenty
180: one hundred and eighty
230: two hundred and thirty
1946: one thousand, nine hundred and forty six
20: twenty
200: two hundred
400: four hundred
200: two hundred
500: five hundred
800: eight hundred
10: ten
20: twenty
200: two hundred
70: seventy
10: ten
30: thirty
10: ten
2: two
150: one hundred and fifty
31: thirty one
30: thirty
50: fifty
20: twenty
30: thirty
40: forty
10: ten
500: five hundred
12: twelve
30: thirty
25: twenty five
30: thirty
139: one hundred and thirty nine
25: twenty five
10: ten
20: twenty
638: six hundred and thirty eight
13: thirteen
10: ten
12: twelve
12: twelve
28: twenty eight
35: thirty five
60: sixty
100: one hundred
5: five
40: forty
12: twelve
25: twenty five
25: twenty five
9: nine
5: five
15: fifteen
25: twenty five
000: zero
150: one hundred and fifty
100: one hundred
150: one hundred and fifty
100: one hundred
150: one hundred and fifty
30: thirty
45: forty five
930: nine hundred and thirty
9: nine
4: four
1: one
48: forty eight
400: four hundred
2: two
707: seven hundred and seven
707: seven hundred and seven
50: fifty
2: two
50: fifty
2: two
350: three hundred and fifty
14: fourteen
24: twenty four
25: twenty five
30: thirty
12: twelve
50: fifty
2: two
975: nine hundred and seventy five
300: three hundred
200: two hundred
2: two
600: six hundred
000: zero
5: five
12: twelve
45: forty five
120: one hundred and twenty
800: eight hundred
1: one
38: thirty eight
1: one
2012: two thousand and twelve
1: one
30: thirty
50: fifty
60: sixty
5: five
300: three hundred
200: two hundred
1947: one thousand, nine hundred and forty seven
52: fifty two
90: ninety
2000: two thousand
40: forty
100: one hundred
000: zero
95: ninety five
40: forty
14: fourteen
2: two
79: seventy nine
45: forty five
100: one hundred
500: five hundred
600: six hundred
100: one hundred
2: two
194: one hundred and ninety four
15: fifteen
1779: one thousand, seven hundred and seventy nine
60: sixty
25: twenty five
200: two hundred
250: two hundred and fifty
30: thirty
600: six hundred
100: one hundred
30: thirty
50: fifty
80: eighty
65: sixty five
25: twenty five
20: twenty
180: one hundred and eighty
250: two hundred and fifty
50: fifty
5: five
000: zero
10: ten
200: two hundred
1800: one thousand, eight hundred
1910: one thousand, nine hundred and ten
10: ten
12: twelve
75: seventy five
100: one hundred
15: fifteen
70: seventy
60: sixty
99: ninety nine
45: forty five
5: five
000: zero
40: forty
60: sixty
12: twelve
15: fifteen
400: four hundred
600: six hundred
1: one
853: eight hundred and fifty three
46: forty six
500: five hundred
25: twenty five
1888: one thousand, eight hundred and eighty eight
250: two hundred and fifty
10: ten
5: five
1958: one thousand, nine hundred and fifty eight
85: eighty five
600: six hundred
800: eight hundred
2006: two thousand and six
15: fifteen
200: two hundred
300: three hundred
38: thirty eight
28: twenty eight
100: one hundred
60: sixty
300: three hundred
300: three hundred
75: seventy five
1899: one thousand, eight hundred and ninety nine
300: three hundred
2: two
200: two hundred
21: twenty one
100: one hundred
40: forty
125: one hundred and twenty five
24: twenty four
50: fifty
23: twenty three
13: thirteen
10: ten
120: one hundred and twenty
50: fifty
38: thirty eight
60: sixty
30: thirty
150: one hundred and fifty
12: twelve
2: two
32: thirty two
2012: two thousand and twelve
800: eight hundred
1907: one thousand, nine hundred and seven
50: fifty
50: fifty
35: thirty five
25: twenty five
1: one
90: ninety
450: four hundred and fifty
900: nine hundred
400: four hundred
500: five hundred
750: seven hundred and fifty
58: fifty eight
370: three hundred and seventy
42: forty two
10: ten
150: one hundred and fifty
50: fifty
1901: one thousand, nine hundred and one
5: five
10: ten
30: thirty
20: twenty
4: four
1985: one thousand, nine hundred and eighty five
175: one hundred and seventy five
12: twelve
11: eleven
60: sixty
12: twelve
100: one hundred
100: one hundred
500: five hundred
11: eleven
100: one hundred
300: three hundred
50: fifty
48: forty eight
30: thirty
1924: one thousand, nine hundred and twenty four
160: one hundred and sixty
80: eighty
24: twenty four
15: fifteen
600: six hundred
2: two
1: one
20: twenty
40: forty
50: fifty
15: fifteen
10: ten
10: ten
25: twenty five
70: seventy
20: twenty
100: one hundred
200: two hundred
15: fifteen
34: thirty four
20: twenty
20: twenty
26: twenty six
3: three
32: thirty two
20: twenty
5: five
60: sixty
800: eight hundred
100: one hundred
20: twenty
65: sixty five
50: fifty
20: twenty
10: ten
150: one hundred and fifty
2: two
63: sixty three
33: thirty three
100: one hundred
000: zero
000: zero
500: five hundred
90: ninety
50: fifty
20: twenty
1900: one thousand, nine hundred
10: ten
200: two hundred
17: seventeen
30: thirty
24: twenty four
120: one hundred and twenty
100: one hundred
100: one hundred
1: one
120: one hundred and twenty
27: twenty seven
1934: one thousand, nine hundred and thirty four
673: six hundred and seventy three
29: twenty nine
30: thirty
6: six
600: six hundred
200: two hundred
62: sixty two
100: one hundred
3: three
180: one hundred and eighty
142: one hundred and forty two
100: one hundred
1958: one thousand, nine hundred and fifty eight
25: twenty five
16: sixteen
300: three hundred
400: four hundred
000: zero
30: thirty
10: ten
12: twelve
24: twenty four
300: three hundred
40: forty
80: eighty
400: four hundred
8: eight
200: two hundred
300: three hundred
800: eight hundred
12: twelve
000: zero
20: twenty
7: seven
0: zero
40: forty
40: forty
75: seventy five
20: twenty
4: four
4: four
29: twenty nine
1770: one thousand, seven hundred and seventy
000: zero
627: six hundred and twenty seven
465: four hundred and sixty five
375: three hundred and seventy five
385: three hundred and eighty five
3: three
2011: two thousand and eleven
1942: one thousand, nine hundred and forty two
60: sixty
7: seven
18: eighteen
000: zero
2001: two thousand and one
11: eleven
8: eight
5: five
180: one hundred and eighty
30: thirty
75: seventy five
1993: one thousand, nine hundred and ninety three
1978: one thousand, nine hundred and seventy eight
18: eighteen
20: twenty
3: three
1: one
24: twenty four
20: twenty
24: twenty four
150: one hundred and fifty
15: fifteen
1850: one thousand, eight hundred and fifty
1035: one thousand and thirty five
100: one hundred
000: zero
20: twenty
1887: one thousand, eight hundred and eighty seven
26: twenty six
15: fifteen
155: one hundred and fifty five
30: thirty
240: two hundred and forty
15: fifteen
115: one hundred and fifteen
10: ten
1: one
10: ten
94: ninety four
24: twenty four
2: two
2: two
300: three hundred
450: four hundred and fifty
65: sixty five
100: one hundred
100: one hundred
195: one hundred and ninety five
300: three hundred
165: one hundred and sixty five
37: thirty seven
1: one
28: twenty eight
1814: one thousand, eight hundred and fourteen
200: two hundred
100: one hundred
75: seventy five
100: one hundred
450: four hundred and fifty
32: thirty two
20: twenty
246: two hundred and forty six
270: two hundred and seventy
400: four hundred
125: one hundred and twenty five
380: three hundred and eighty
125: one hundred and twenty five
200: two hundred
250: two hundred and fifty
21: twenty one
1: one
450: four hundred and fifty
40: forty
15: fifteen
170: one hundred and seventy
15: fifteen
700: seven hundred
9: nine
200: two hundred
400: four hundred
75: seventy five
600: six hundred
300: three hundred
170: one hundred and seventy
10: ten
2014: two thousand and fourteen
000: zero
3: three
000: zero
30: thirty
1948: one thousand, nine hundred and forty eight
15: fifteen
50: fifty
20: twenty
18: eighteen
18: eighteen
32: thirty two
100: one hundred
1: one
500: five hundred
1338: one thousand, three hundred and thirty eight
12: twelve
24: twenty four
000: zero
10: ten
1545: one thousand, five hundred and forty five
25: twenty five
2008: two thousand and eight
500: five hundred
112: one hundred and twelve
16: sixteen
35: thirty five
500: five hundred
10: ten
4: four
700: seven hundred
17: seventeen
17: seventeen
2: two
8: eight
17: seventeen
1: one
100: one hundred
2014: two thousand and fourteen
515: five hundred and fifteen
210: two hundred and ten
125: one hundred and twenty five
550: five hundred and fifty
625: six hundred and twenty five
430: four hundred and thirty
150: one hundred and fifty
15: fifteen
75: seventy five
190: one hundred and ninety
133: one hundred and thirty three
295: two hundred and ninety five
37: thirty seven
000: zero
65: sixty five
425: four hundred and twenty five
147: one hundred and forty seven
8: eight
10: ten
62: sixty two
110: one hundred and ten
15: fifteen
20: twenty
20: twenty
21: twenty one
14: fourteen
3: three
25: twenty five
30: thirty
100: one hundred
110: one hundred and ten
800: eight hundred
200: two hundred
000: zero
20: twenty
1: one
400: four hundred
12: twelve
24: twenty four
6000: six thousand
135: one hundred and thirty five
20: twenty
30: thirty
50: fifty
15: fifteen
40: forty
000: zero
1905: one thousand, nine hundred and five
100: one hundred
3: three
100: one hundred
100: one hundred
50: fifty
2014: two thousand and fourteen
1984: one thousand, nine hundred and eighty four
2014: two thousand and fourteen
14: fourteen
87: eighty seven
3: three
20: twenty
65: sixty five
170: one hundred and seventy
11: eleven
2001: two thousand and one
96: ninety six
20: twenty
1945: one thousand, nine hundred and forty five
1965: one thousand, nine hundred and sixty five
1739: one thousand, seven hundred and thirty nine
28: twenty eight
161: one hundred and sixty one
35: thirty five
000: zero
14: fourteen
70: seventy
10: ten
20: twenty
40: forty
1606: one thousand, six hundred and six
10: ten
20: twenty
25: twenty five
300: three hundred
500: five hundred
200: two hundred
1996: one thousand, nine hundred and ninety six
95: ninety five
18: eighteen
59: fifty nine
23: twenty three
18: eighteen
12: twelve
21: twenty one
1976: one thousand, nine hundred and seventy six
25: twenty five
9: nine
458: four hundred and fifty eight
40: forty
11: eleven
15: fifteen
11: eleven
75: seventy five
15: fifteen
15: fifteen
20: twenty
1: one
10: ten
20: twenty
40: forty
85: eighty five
35: thirty five
5: five
1949: one thousand, nine hundred and forty nine
20: twenty
40: forty
11: eleven
70: seventy
30: thirty
000: zero
50: fifty
000: zero
65: sixty five
2016: two thousand and sixteen
28: twenty eight
100: one hundred
150: one hundred and fifty
2050: two thousand and fifty
100: one hundred
1963: one thousand, nine hundred and sixty three
20: twenty
80: eighty
100: one hundred
20: twenty
17: seventeen
000: zero
300: three hundred
46: forty six
700: seven hundred
1527: one thousand, five hundred and twenty seven
82: eighty two
60: sixty
18: eighteen
20: twenty
55: fifty five
1958: one thousand, nine hundred and fifty eight
16: sixteen
13: thirteen
12: twelve
000: zero
100: one hundred
14: fourteen
30: thirty
155: one hundred and fifty five
50: fifty
22: twenty two
000: zero
000: zero
40: forty
100: one hundred
150: one hundred and fifty
52: fifty two
11: eleven
24: twenty four
300: three hundred
400: four hundred
30: thirty
40: forty
30: thirty
27: twenty seven
46: forty six
300: three hundred
10: ten
4: four
7: seven
40: forty
5: five
5: five
80: eighty
80: eighty
150: one hundred and fifty
76: seventy six
20: twenty
2013: two thousand and thirteen
40: forty
20: twenty
930: nine hundred and thirty
1800: one thousand, eight hundred
680: six hundred and eighty
500: five hundred
80: eighty
90: ninety
95: ninety five
50: fifty
11: eleven
24: twenty four
300: three hundred
3: three
000: zero
89: eighty nine
20: twenty
40: forty
5: five
191: one hundred and ninety one
17: seventeen
1963: one thousand, nine hundred and sixty three
40: forty
60: sixty
40: forty
50: fifty
4: four
96: ninety six
0: zero
150: one hundred and fifty
180: one hundred and eighty
5: five
200: two hundred
300: three hundred
26: twenty six
11: eleven
15: fifteen
20: twenty
200: two hundred
120: one hundred and twenty
1: one
20: twenty
90: ninety
100: one hundred
200: two hundred
300: three hundred
100: one hundred
10: ten
20: twenty
82: eighty two
20: twenty
200: two hundred
100: one hundred
200: two hundred
100: one hundred
30: thirty
60: sixty
200: two hundred
200: two hundred
3: three
97: ninety seven
5: five
1: one
5: five
85: eighty five
40: forty
15: fifteen
20: twenty
2: two
4: four
2: two
500: five hundred
20: twenty
20: twenty
25: twenty five
30: thirty
27: twenty seven
000: zero
50: fifty
12: twelve
1: one
12: twelve
1: one
12: twelve
30: thirty
1: one
2: two
000: zero
520: five hundred and twenty
30: thirty
18: eighteen
18: eighteen
16: sixteen
18: eighteen
15: fifteen
20: twenty
7: seven
460: four hundred and sixty
1: one
30: thirty
3: three
2: two
12: twelve
2022: two thousand and twenty two
63: sixty three
3: three
15: fifteen
10: ten
2010: two thousand and ten
1923: one thousand, nine hundred and twenty three
11: eleven
25: twenty five
21: twenty one
100: one hundred
140: one hundred and forty
150: one hundred and fifty
200: two hundred
90: ninety
75: seventy five
20: twenty
16: sixteen
1723: one thousand, seven hundred and twenty three
150: one hundred and fifty
100: one hundred
200: two hundred
50: fifty
100: one hundred
90: ninety
200: two hundred
300: three hundred
30: thirty
24: twenty four
5: five
000: zero
000: zero
1: one
70: seventy
2: two
84: eighty four
1988: one thousand, nine hundred and eighty eight
15: fifteen
150: one hundred and fifty
29: twenty nine
38: thirty eight
1991: one thousand, nine hundred and ninety one
40: forty
55: fifty five
15: fifteen
20: twenty
47: forty seven
28: twenty eight
30: thirty
25: twenty five
98: ninety eight
230: two hundred and thirty
11: eleven
30: thirty
28: twenty eight
1605: one thousand, six hundred and five
40: forty
16: sixteen
80: eighty
120: one hundred and twenty
8: eight
60: sixty
20: twenty
30: thirty
1989: one thousand, nine hundred and eighty nine
20: twenty
18: eighteen
1987: one thousand, nine hundred and eighty seven
1926: one thousand, nine hundred and twenty six
13: thirteen
14: fourteen
250: two hundred and fifty
150: one hundred and fifty
400: four hundred
500: five hundred
120: one hundred and twenty
40: forty
40: forty
7: seven
9: nine
12: twelve
30: thirty
90: ninety
15: fifteen
150: one hundred and fifty
100: one hundred
180: one hundred and eighty
000: zero
175: one hundred and seventy five
150: one hundred and fifty
1828: one thousand, eight hundred and twenty eight
70: seventy
56: fifty six
10: ten
15: fifteen
20: twenty
130: one hundred and thirty
15: fifteen
20: twenty
12: twelve
330: three hundred and thirty
400: four hundred
100: one hundred
100: one hundred
100: one hundred
1911: one thousand, nine hundred and eleven
10: ten
340: three hundred and forty
75: seventy five
60: sixty
1828: one thousand, eight hundred and twenty eight
70: seventy
14: fourteen
15: fifteen
1651: one thousand, six hundred and fifty one
95: ninety five
20: twenty
50: fifty
60: sixty
520: five hundred and twenty
750: seven hundred and fifty
1948: one thousand, nine hundred and forty eight
800: eight hundred
20: twenty
25: twenty five
25: twenty five
60: sixty
500: five hundred
000: zero
58: fifty eight
20: twenty
750: seven hundred and fifty
90: ninety
10: ten
15: fifteen
1: one
12: twelve
000: zero
000: zero
20: twenty
24: twenty four
34: thirty four
729: seven hundred and twenty nine
20: twenty
10: ten
100: one hundred
130: one hundred and thirty
26: twenty six
000: zero
800: eight hundred
37: thirty seven
34: thirty four
90: ninety
15: fifteen
465: four hundred and sixty five
25: twenty five
000: zero
800: eight hundred
15: fifteen
3: three
50: fifty
2: two
18: eighteen
400: four hundred
415: four hundred and fifteen
30: thirty
19: nineteen
23: twenty three
400: four hundred
1706: one thousand, seven hundred and six
16: sixteen
20: twenty
200: two hundred
21: twenty one
500: five hundred
125: one hundred and twenty five
45: forty five
28: twenty eight
29: twenty nine
54: fifty four
16: sixteen
6: six
60: sixty
20: twenty
67: sixty seven
22: twenty two
60: sixty
000: zero
15: fifteen
200: two hundred
5: five
600: six hundred
15: fifteen
1933: one thousand, nine hundred and thirty three
10: ten
40: forty
90: ninety
000: zero
100: one hundred
000: zero
55: fifty five
400: four hundred
30: thirty
000: zero
246: two hundred and forty six
68: sixty eight
30: thirty
135: one hundred and thirty five
160: one hundred and sixty
175: one hundred and seventy five
000: zero
000: zero
2006: two thousand and six
57: fifty seven
185: one hundred and eighty five
125: one hundred and twenty five
205: two hundred and five
950: nine hundred and fifty
60: sixty
125: one hundred and twenty five
450: four hundred and fifty
385: three hundred and eighty five
5: five
7: seven
3: three
31: thirty one
80: eighty
69: sixty nine
8: eight
000: zero
350: three hundred and fifty
400: four hundred
550: five hundred and fifty
600: six hundred
650: six hundred and fifty
000: zero
000: zero
165: one hundred and sixty five
15: fifteen
90: ninety
20: twenty
10: ten
85: eighty five
100: one hundred
2: two
120: one hundred and twenty
13: thirteen
260: two hundred and sixty
27: twenty seven
61: sixty one
300: three hundred
165: one hundred and sixty five
175: one hundred and seventy five
50: fifty
225: two hundred and twenty five
35: thirty five
35: thirty five
42: forty two
500: five hundred
70: seventy
700: seven hundred
245: two hundred and forty five
90: ninety
120: one hundred and twenty
200: two hundred
65: sixty five
50: fifty
2: two
60: sixty
8: eight
425: four hundred and twenty five
225: two hundred and twenty five
1: one
200: two hundred
99: ninety nine
50: fifty
000: zero
400: four hundred
500: five hundred
100: one hundred
300: three hundred
500: five hundred
99: ninety nine
3: three
1: one
35: thirty five
270: two hundred and seventy
32: thirty two
32: thirty two
30: thirty
20: twenty
130: one hundred and thirty
300: three hundred
23: twenty three
12: twelve
22: twenty two
20: twenty
22: twenty two
29: twenty nine
000: zero
60: sixty
10: ten
15: fifteen
239: two hundred and thirty nine
14: fourteen
250: two hundred and fifty
16: sixteen
350: three hundred and fifty
370: three hundred and seventy
350: three hundred and fifty
000: zero
99: ninety nine
53: fifty three
5: five
75: seventy five
40: forty
40: forty
30: thirty
2017: two thousand and seventeen
2: two
16: sixteen
18: eighteen
5: five
10: ten
20: twenty
6: six
15: fifteen
20: twenty
000: zero
1918: one thousand, nine hundred and eighteen
1923: one thousand, nine hundred and twenty three
53: fifty three
55: fifty five
130: one hundred and thirty
12: twelve
69: sixty nine
160: one hundred and sixty
36: thirty six
200: two hundred
100: one hundred
150: one hundred and fifty
95: ninety five
9: nine
155: one hundred and fifty five
10: ten
25: twenty five
30: thirty
235: two hundred and thirty five
125: one hundred and twenty five
110: one hundred and ten
1981: one thousand, nine hundred and eighty one
12: twelve
50: fifty
10: ten
1: one
1967: one thousand, nine hundred and sixty seven
90: ninety
400: four hundred
40: forty
4: four
000: zero
50: fifty
30: thirty
100: one hundred
25: twenty five
8: eight
1745: one thousand, seven hundred and forty five
000: zero
8: eight
5: five
2016: two thousand and sixteen
10: ten
1: one
15: fifteen
80: eighty
35: thirty five
20: twenty
100: one hundred
50: fifty
40: forty
10: ten
10: ten
620: six hundred and twenty
12: twelve
350: three hundred and fifty
500: five hundred
14: fourteen
10: ten
11: eleven
11: eleven
10: ten
647: six hundred and forty seven
101: one hundred and one
30: thirty
200: two hundred
200: two hundred
300: three hundred
140: one hundred and forty
1918: one thousand, nine hundred and eighteen
1961: one thousand, nine hundred and sixty one
1940: one thousand, nine hundred and forty
18: eighteen
250: two hundred and fifty
165: one hundred and sixty five
85: eighty five
42: forty two
820: eight hundred and twenty
44: forty four
250: two hundred and fifty
1759: one thousand, seven hundred and fifty nine
100: one hundred
15: fifteen
60: sixty
20: twenty
30: thirty
23: twenty three
13: thirteen
7: seven
000: zero
000: zero
9000: nine thousand
966: nine hundred and sixty six
100: one hundred
55: fifty five
15: fifteen
500: five hundred
14: fourteen
65: sixty five
96: ninety six
5: five
20: twenty
35: thirty five
200: two hundred
30: thirty
1940: one thousand, nine hundred and forty
15: fifteen
18: eighteen
5: five
30: thirty
20: twenty
100: one hundred
14: fourteen
95: ninety five
4: four
2: two
1960: one thousand, nine hundred and sixty
1963: one thousand, nine hundred and sixty three
57: fifty seven
900: nine hundred
6: six
90: ninety
40: forty
000: zero
2: two
30: thirty
000: zero
2009: two thousand and nine
10: ten
10: ten
40: forty
60: sixty
25: twenty five
35: thirty five
78: seventy eight
1826: one thousand, eight hundred and twenty six
12: twelve
6: six
25: twenty five
27: twenty seven
1: one
300: three hundred
400: four hundred
100: one hundred
60: sixty
425: four hundred and twenty five
000: zero
10: ten
13: thirteen
425: four hundred and twenty five
6: six
100: one hundred
70: seventy
6: six
100: one hundred
17: seventeen
100: one hundred
120: one hundred and twenty
50: fifty
5: five
12: twelve
75: seventy five
10: ten
209: two hundred and nine
50: fifty
260: two hundred and sixty
260: two hundred and sixty
160: one hundred and sixty
14: fourteen
50: fifty
26: twenty six
18: eighteen
8: eight
70: seventy
20: twenty
500: five hundred
1: one
3: three
12: twelve
1878: one thousand, eight hundred and seventy eight
30: thirty
40: forty
6: six
2015: two thousand and fifteen
7: seven
12: twelve
1977: one thousand, nine hundred and seventy seven
30: thirty
1954: one thousand, nine hundred and fifty four
20: twenty
12: twelve
2015: two thousand and fifteen
2: two
54: fifty four
10: ten
24: twenty four
300: three hundred
218: two hundred and eighteen
35: thirty five
1951: one thousand, nine hundred and fifty one
20: twenty
90: ninety
90: ninety
15: fifteen
854: eight hundred and fifty four
1985: one thousand, nine hundred and eighty five
20: twenty
000: zero
30: thirty
3: three
3: three
5: five
49: forty nine
350: three hundred and fifty
100: one hundred
200: two hundred
105: one hundred and five
12: twelve
1: one
5: five
400: four hundred
5: five
1: one
2: two
000: zero
2011: two thousand and eleven
1911: one thousand, nine hundred and eleven
1967: one thousand, nine hundred and sixty seven
15: fifteen
11: eleven
2016: two thousand and sixteen
646: six hundred and forty six
2: two
2: two
1973: one thousand, nine hundred and seventy three
65: sixty five
100: one hundred
150: one hundred and fifty
600: six hundred
400: four hundred
500: five hundred
1994: one thousand, nine hundred and ninety four
17: seventeen
30: thirty
15: fifteen
200: two hundred
300: three hundred
15: fifteen
2016: two thousand and sixteen
50: fifty
50: fifty
100: one hundred
520: five hundred and twenty
150: one hundred and fifty
2300: two thousand, three hundred
24: twenty four
15: fifteen
40: forty
10: ten
200: two hundred
1: one
700: seven hundred
200: two hundred
18: eighteen
15: fifteen
20: twenty
14: fourteen
69: sixty nine
3: three
200: two hundred
25: twenty five
930: nine hundred and thirty
10: ten
2016: two thousand and sixteen
2016: two thousand and sixteen
2016: two thousand and sixteen
180: one hundred and eighty
13: thirteen
7: seven
1852: one thousand, eight hundred and fifty two
1: one
35: thirty five
150: one hundred and fifty
42: forty two
2: two
100: one hundred
70: seventy
100: one hundred
20: twenty
30: thirty
4: four
60: sixty
100: one hundred
100: one hundred
700: seven hundred
20: twenty
000: zero
70: seventy
1: one
20: twenty
65: sixty five
70: seventy
20: twenty
20: twenty
100: one hundred
000: zero
30: thirty
110: one hundred and ten
22: twenty two
24: twenty four
120: one hundred and twenty
10: ten
12: twelve
14: fourteen
5: five
31: thirty one
12: twelve
7: seven
9: nine
2: two
74: seventy four
1: one
2009: two thousand and nine
40: forty
1969: one thousand, nine hundred and sixty nine
1998: one thousand, nine hundred and ninety eight
14: fourteen
1973: one thousand, nine hundred and seventy three
000: zero
100: one hundred
25: twenty five
20: twenty
48: forty eight
17: seventeen
24: twenty four
4: four
000: zero
2: two
55: fifty five
4: four
24: twenty four
28: twenty eight
27: twenty seven
27: twenty seven
30: thirty
30: thirty
12: twelve
600: six hundred
500: five hundred
1835: one thousand, eight hundred and thirty five
22: twenty two
20: twenty
45: forty five
60: sixty
89: eighty nine
40: forty
0: zero
12: twelve
10: ten
1826: one thousand, eight hundred and twenty six
000: zero
5: five
1848: one thousand, eight hundred and forty eight
14: fourteen
16: sixteen
11: eleven
800: eight hundred
300: three hundred
000: zero
100: one hundred
600: six hundred
125: one hundred and twenty five
20: twenty
85: eighty five
40: forty
8: eight
18: eighteen
73: seventy three
40: forty
2: two
1948: one thousand, nine hundred and forty eight
80: eighty
500: five hundred
200: two hundred
400: four hundred
1948: one thousand, nine hundred and forty eight
20: twenty
1790: one thousand, seven hundred and ninety
1984: one thousand, nine hundred and eighty four
2: two
000: zero
400: four hundred
150: one hundred and fifty
15: fifteen
25: twenty five
000: zero
1: one
000: zero
47: forty seven
22: twenty two
5: five
40: forty
100: one hundred
40: forty
10: ten
2: two
1: one
49: forty nine
100: one hundred
350: three hundred and fifty
155: one hundred and fifty five
90: ninety
95: ninety five
000: zero
325: three hundred and twenty five
320: three hundred and twenty
325: three hundred and twenty five
10: ten
20: twenty
15: fifteen
30: thirty
80: eighty
60: sixty
15: fifteen
1: one
10: ten
20: twenty
2: two
1848: one thousand, eight hundred and forty eight
1842: one thousand, eight hundred and forty two
90: ninety
25: twenty five
30: thirty
10: ten

Some additional insights:
Average sentence length: 7.231702262265756
Max sentence length: 28
Min sentence length: 3
Unique words: 17388

Shape: (45839, 1)
                                            sentence
0                  WHEN YOU'RE COOKING CHIPS AT HOME
1  THE TRADITIONAL CHIP PAN OFTEN STAYS ON THE SHELF
2               THROUGH WHAT THEY CALL A KNIFE BLOCK
3         WHICH INVOLVES FIRING A POTATO DOWN A PIPE
4  APART FROM THE GOLDEN COLOUR AND THE DELICIOUS...
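The conversions listed above use inflect's default reading, so four-digit years come out in long form (for example, 1972 becomes "one thousand, nine hundred and seventy two" after the hyphens are stripped) and thousands groups that arrive as separate tokens, such as "000", collapse to "zero". The sketch below only assumes inflect's documented number_to_words options (group and andword) and shows how a year-style or "and"-free reading could be produced instead; whether that is preferable for this corpus is a design choice rather than something established here:
In [ ]:
import inflect

p = inflect.engine()
print(p.number_to_words("1972"))             # default: 'one thousand, nine hundred and seventy-two'
print(p.number_to_words("1972", group=2))    # grouped in pairs, roughly 'nineteen, seventy-two'
print(p.number_to_words("110", andword=""))  # drops the 'and': 'one hundred ten'
print(p.number_to_words("000"))              # 'zero', as seen in the list above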
In [ ]:
# Define a color palette
palette = {
    "histogram": "#2980B9",
    "bar1": "#3498DB",
    "bar2": "#E74C3C",
    "bar3": "#1ABC9C"
}

# 1. Distribution of Sentence Lengths
plt.figure(figsize=(12, 6))
sentence_lengths = df['sentence'].str.split().apply(len)
sns.histplot(sentence_lengths, bins=30, color=palette["histogram"], edgecolor='black', alpha=0.7)
plt.title('Distribution of Sentence Lengths', fontsize=15)
plt.xlabel('Sentence Length (words)', fontsize=12)
plt.ylabel('Frequency', fontsize=12)
plt.grid(axis='y', linestyle='--', alpha=0.5)
plt.tight_layout()
plt.show()

# 2. Proportion of Sentences with Numbers Converted vs. Total Sentences
plt.figure(figsize=(10, 6))
labels = ['Sentences with Numbers Converted', 'Other Sentences']
values = [lines_with_numbers, len(df) - lines_with_numbers]
bars = plt.bar(labels, values, color=[palette["bar1"], palette["bar2"]])
for bar in bars:
    yval = bar.get_height()
    plt.text(bar.get_x() + bar.get_width()/2, yval + 100,
             f'{yval} ({yval/len(df)*100:.1f}%)',
             ha='center', va='bottom', fontweight='bold')
plt.title('Proportion of Sentences with Numbers Converted')
plt.ylabel('Count')
plt.tight_layout()
plt.show()

# 3. Top 10 Most Frequently Converted Numbers
num_freq = Counter([num for num, word in numbers_converted])
common_nums = num_freq.most_common(10)
nums, counts = zip(*common_nums)

plt.figure(figsize=(12, 7))
bars = plt.barh(nums, counts, color=palette["bar3"])
plt.gca().invert_yaxis()  # To display the most frequent number at the top
for bar in bars:
    plt.text(bar.get_width() - (0.02 * max(counts)), bar.get_y() + bar.get_height()/2,
             str(int(bar.get_width())), va='center', ha='right', color='white', fontweight='bold')
plt.title('Top 10 Most Frequently Converted Numbers', fontsize=15)
plt.xlabel('Frequency', fontsize=12)
plt.ylabel('Number', fontsize=12)
plt.tight_layout()
plt.show()
[Figure: Distribution of Sentence Lengths (histogram)]
[Figure: Proportion of Sentences with Numbers Converted (bar chart)]
[Figure: Top 10 Most Frequently Converted Numbers (horizontal bar chart)]
In [ ]:
sentences = df['sentence'].tolist()

# Calculate Unique Word Count
unique_words = set(word for sentence in sentences for word in sentence.split())
print(f"Number of unique words: {len(unique_words)}")

# Initial Letters Distribution
initial_letters = [word[0].lower() for sentence in sentences for word in sentence.split()]
initial_letter_freq = Counter(initial_letters)

# Vowel and Consonant Distribution (counted per character; 'y' is treated as a consonant)
vowels = set("aeiou")
all_chars = ''.join(sentences).lower()
num_vowels = sum(1 for char in all_chars if char in vowels)
num_consonants = sum(1 for char in all_chars if char.isalpha() and char not in vowels)
print(f"\nThe number of vowels is: {num_vowels}")
print(f"The number of consonants is: {num_consonants}")
Number of unique words: 17388

The number of vowels is: 534404
The number of consonants is: 858424
In [ ]:
# Calculate sentence lengths
sentence_lengths = [len(nltk.word_tokenize(line)) for line in df['sentence']]

# Statistics
average_length = np.mean(sentence_lengths)
shortest_length = np.min(sentence_lengths)
longest_length = np.max(sentence_lengths)

# Print statistics
print("Average sentence length:", average_length)
print("Median sentence length:", np.median(sentence_lengths))
print("Standard deviation of sentence length:", np.std(sentence_lengths, ddof=1))
print("Minimum sentence length:", shortest_length)
print("Maximum sentence length:", longest_length)

# Histogram for Sentence Lengths Distribution
plt.figure(figsize=(10, 6))
plt.hist(sentence_lengths, bins=30, edgecolor='k', alpha=0.7, color="#3498DB")
plt.title('Sentence Lengths Distribution')
plt.xlabel('Sentence Length')
plt.ylabel('Number of Sentences')
plt.show()

# Bar plot for Average, Shortest, and Longest sentence lengths
plt.figure(figsize=(10, 6))
sentence_labels = ['Average', 'Shortest', 'Longest']
lengths = [average_length, shortest_length, longest_length]
sns.barplot(x=sentence_labels, y=lengths, palette="Blues_d")
plt.title('Sentence Lengths Overview')
plt.ylabel('Number of Words')
plt.show()


# Unique Word Count Visualization
plt.figure(figsize=(5, 6))
sns.barplot(x=['Unique Words'], y=[len(unique_words)], palette="Purples_d")
plt.title('Unique Word Count')
plt.show()

# Vowel vs. Consonant Distribution Visualization
plt.figure(figsize=(8, 8))
labels = ['Vowels', 'Consonants']
sizes = [num_vowels, num_consonants]
colors = ['#ff9999','#66b2b2']
plt.pie(sizes, labels=labels, colors=colors, autopct='%1.1f%%', startangle=90)
plt.title('Vowel vs. Consonant Distribution')
plt.axis('equal')  # Equal aspect ratio ensures pie is drawn as a circle.
plt.show()

# Initial Letter Distribution Visualization
plt.figure(figsize=(14, 8))
letters, counts = zip(*initial_letter_freq.most_common())
sns.barplot(x=list(letters), y=list(counts), palette="viridis")
plt.title('Initial Letter Distribution')
plt.xlabel('Initial Letter')
plt.ylabel('Count')
plt.show()

# Number-to-Word Conversions Table
print("\nNumbers converted to words:")
for number, words in numbers_converted:
    print(f"{number}: {words.replace('-', ' ')}")

# Insights from DataFrame Visualization
plt.figure(figsize=(10, 6))
df_lengths = [df['sentence'].str.split().apply(len).mean(), df['sentence'].str.split().apply(len).min(), df['sentence'].str.split().apply(len).max()]
df_labels = ['Average', 'Shortest', 'Longest']
sns.barplot(x=df_labels, y=df_lengths, palette="Greens_d")
plt.title('Sentence Lengths from DataFrame')
plt.ylabel('Number of Words')
plt.show()
Average sentence length: 7.540958572394686
Median sentence length: 6.0
Standard deviation of sentence length: 3.871960425939225
Minimum sentence length: 3
Maximum sentence length: 30
[Figure: Sentence Lengths Distribution (histogram)]
[Figure: Sentence Lengths Overview: Average, Shortest, Longest (bar plot)]
[Figure: Unique Word Count (bar plot)]
[Figure: Vowel vs. Consonant Distribution (pie chart)]
[Figure: Initial Letter Distribution (bar plot)]
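The average length reported here (about 7.54 tokens) is higher than the 7.23 computed earlier with str.split() because nltk.word_tokenize follows Treebank conventions and splits contractions such as YOU'RE into two tokens (and separates punctuation). A minimal sketch of the difference, reusing the first sentence of the corpus:
In [ ]:
from nltk.tokenize import word_tokenize  # punkt was already downloaded at the top of the notebook

sample = "WHEN YOU'RE COOKING CHIPS AT HOME"
print(len(sample.split()), sample.split())                 # 6 ['WHEN', "YOU'RE", 'COOKING', 'CHIPS', 'AT', 'HOME']
print(len(word_tokenize(sample)), word_tokenize(sample))   # 7 ['WHEN', 'YOU', "'RE", 'COOKING', 'CHIPS', 'AT', 'HOME']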
75: seventy five
120: one hundred and twenty
75: seventy five
5: five
5: five
11: eleven
3: three
50: fifty
2013: two thousand and thirteen
20: twenty
40: forty
3: three
35: thirty five
24: twenty four
12: twelve
10: ten
18: eighteen
18: eighteen
65: sixty five
100: one hundred
50: fifty
57: fifty seven
300: three hundred
50: fifty
8: eight
75: seventy five
30: thirty
50: fifty
28: twenty eight
10: ten
20: twenty
400: four hundred
500: five hundred
150: one hundred and fifty
7: seven
18: eighteen
26: twenty six
1984: one thousand, nine hundred and eighty four
462: four hundred and sixty two
2014: two thousand and fourteen
72: seventy two
1820: one thousand, eight hundred and twenty
2: two
1754: one thousand, seven hundred and fifty four
10: ten
10: ten
60: sixty
75: seventy five
94: ninety four
50: fifty
100: one hundred
370: three hundred and seventy
40: forty
50: fifty
51: fifty one
200: two hundred
1: one
70: seventy
100: one hundred
200: two hundred
1893: one thousand, eight hundred and ninety three
1991: one thousand, nine hundred and ninety one
100: one hundred
25: twenty five
100: one hundred
30: thirty
10: ten
1: one
200: two hundred
300: three hundred
4: four
120: one hundred and twenty
180: one hundred and eighty
230: two hundred and thirty
1946: one thousand, nine hundred and forty six
20: twenty
200: two hundred
400: four hundred
200: two hundred
500: five hundred
800: eight hundred
10: ten
20: twenty
200: two hundred
70: seventy
10: ten
30: thirty
10: ten
2: two
150: one hundred and fifty
31: thirty one
30: thirty
50: fifty
20: twenty
30: thirty
40: forty
10: ten
500: five hundred
12: twelve
30: thirty
25: twenty five
30: thirty
139: one hundred and thirty nine
25: twenty five
10: ten
20: twenty
638: six hundred and thirty eight
13: thirteen
10: ten
12: twelve
12: twelve
28: twenty eight
35: thirty five
60: sixty
100: one hundred
5: five
40: forty
12: twelve
25: twenty five
25: twenty five
9: nine
5: five
15: fifteen
25: twenty five
000: zero
150: one hundred and fifty
100: one hundred
150: one hundred and fifty
100: one hundred
150: one hundred and fifty
30: thirty
45: forty five
930: nine hundred and thirty
9: nine
4: four
1: one
48: forty eight
400: four hundred
2: two
707: seven hundred and seven
707: seven hundred and seven
50: fifty
2: two
50: fifty
2: two
350: three hundred and fifty
14: fourteen
24: twenty four
25: twenty five
30: thirty
12: twelve
50: fifty
2: two
975: nine hundred and seventy five
300: three hundred
200: two hundred
2: two
600: six hundred
000: zero
5: five
12: twelve
45: forty five
120: one hundred and twenty
800: eight hundred
1: one
38: thirty eight
1: one
2012: two thousand and twelve
1: one
30: thirty
50: fifty
60: sixty
5: five
300: three hundred
200: two hundred
1947: one thousand, nine hundred and forty seven
52: fifty two
90: ninety
2000: two thousand
40: forty
100: one hundred
000: zero
95: ninety five
40: forty
14: fourteen
2: two
79: seventy nine
45: forty five
100: one hundred
500: five hundred
600: six hundred
100: one hundred
2: two
194: one hundred and ninety four
15: fifteen
1779: one thousand, seven hundred and seventy nine
60: sixty
25: twenty five
200: two hundred
250: two hundred and fifty
30: thirty
600: six hundred
100: one hundred
30: thirty
50: fifty
80: eighty
65: sixty five
25: twenty five
20: twenty
180: one hundred and eighty
250: two hundred and fifty
50: fifty
5: five
000: zero
10: ten
200: two hundred
1800: one thousand, eight hundred
1910: one thousand, nine hundred and ten
10: ten
12: twelve
75: seventy five
100: one hundred
15: fifteen
70: seventy
60: sixty
99: ninety nine
45: forty five
5: five
000: zero
40: forty
60: sixty
12: twelve
15: fifteen
400: four hundred
600: six hundred
1: one
853: eight hundred and fifty three
46: forty six
500: five hundred
25: twenty five
1888: one thousand, eight hundred and eighty eight
250: two hundred and fifty
10: ten
5: five
1958: one thousand, nine hundred and fifty eight
85: eighty five
600: six hundred
800: eight hundred
2006: two thousand and six
15: fifteen
200: two hundred
300: three hundred
38: thirty eight
28: twenty eight
100: one hundred
60: sixty
300: three hundred
300: three hundred
75: seventy five
1899: one thousand, eight hundred and ninety nine
300: three hundred
2: two
200: two hundred
21: twenty one
100: one hundred
40: forty
125: one hundred and twenty five
24: twenty four
50: fifty
23: twenty three
13: thirteen
10: ten
120: one hundred and twenty
50: fifty
38: thirty eight
60: sixty
30: thirty
150: one hundred and fifty
12: twelve
2: two
32: thirty two
2012: two thousand and twelve
800: eight hundred
1907: one thousand, nine hundred and seven
50: fifty
50: fifty
35: thirty five
25: twenty five
1: one
90: ninety
450: four hundred and fifty
900: nine hundred
400: four hundred
500: five hundred
750: seven hundred and fifty
58: fifty eight
370: three hundred and seventy
42: forty two
10: ten
150: one hundred and fifty
50: fifty
1901: one thousand, nine hundred and one
5: five
10: ten
30: thirty
20: twenty
4: four
1985: one thousand, nine hundred and eighty five
175: one hundred and seventy five
12: twelve
11: eleven
60: sixty
12: twelve
100: one hundred
100: one hundred
500: five hundred
11: eleven
100: one hundred
300: three hundred
50: fifty
48: forty eight
30: thirty
1924: one thousand, nine hundred and twenty four
160: one hundred and sixty
80: eighty
24: twenty four
15: fifteen
600: six hundred
2: two
1: one
20: twenty
40: forty
50: fifty
15: fifteen
10: ten
10: ten
25: twenty five
70: seventy
20: twenty
100: one hundred
200: two hundred
15: fifteen
34: thirty four
20: twenty
20: twenty
26: twenty six
3: three
32: thirty two
20: twenty
5: five
60: sixty
800: eight hundred
100: one hundred
20: twenty
65: sixty five
50: fifty
20: twenty
10: ten
150: one hundred and fifty
2: two
63: sixty three
33: thirty three
100: one hundred
000: zero
000: zero
500: five hundred
90: ninety
50: fifty
20: twenty
1900: one thousand, nine hundred
10: ten
200: two hundred
17: seventeen
30: thirty
24: twenty four
120: one hundred and twenty
100: one hundred
100: one hundred
1: one
120: one hundred and twenty
27: twenty seven
1934: one thousand, nine hundred and thirty four
673: six hundred and seventy three
29: twenty nine
30: thirty
6: six
600: six hundred
200: two hundred
62: sixty two
100: one hundred
3: three
180: one hundred and eighty
142: one hundred and forty two
100: one hundred
1958: one thousand, nine hundred and fifty eight
25: twenty five
16: sixteen
300: three hundred
400: four hundred
000: zero
30: thirty
10: ten
12: twelve
24: twenty four
300: three hundred
40: forty
80: eighty
400: four hundred
8: eight
200: two hundred
300: three hundred
800: eight hundred
12: twelve
000: zero
20: twenty
7: seven
0: zero
40: forty
40: forty
75: seventy five
20: twenty
4: four
4: four
29: twenty nine
1770: one thousand, seven hundred and seventy
000: zero
627: six hundred and twenty seven
465: four hundred and sixty five
375: three hundred and seventy five
385: three hundred and eighty five
3: three
2011: two thousand and eleven
1942: one thousand, nine hundred and forty two
60: sixty
7: seven
18: eighteen
000: zero
2001: two thousand and one
11: eleven
8: eight
5: five
180: one hundred and eighty
30: thirty
75: seventy five
1993: one thousand, nine hundred and ninety three
1978: one thousand, nine hundred and seventy eight
18: eighteen
20: twenty
3: three
1: one
24: twenty four
20: twenty
24: twenty four
150: one hundred and fifty
15: fifteen
1850: one thousand, eight hundred and fifty
1035: one thousand and thirty five
100: one hundred
000: zero
20: twenty
1887: one thousand, eight hundred and eighty seven
26: twenty six
15: fifteen
155: one hundred and fifty five
30: thirty
240: two hundred and forty
15: fifteen
115: one hundred and fifteen
10: ten
1: one
10: ten
94: ninety four
24: twenty four
2: two
2: two
300: three hundred
450: four hundred and fifty
65: sixty five
100: one hundred
100: one hundred
195: one hundred and ninety five
300: three hundred
165: one hundred and sixty five
37: thirty seven
1: one
28: twenty eight
1814: one thousand, eight hundred and fourteen
200: two hundred
100: one hundred
75: seventy five
100: one hundred
450: four hundred and fifty
32: thirty two
20: twenty
246: two hundred and forty six
270: two hundred and seventy
400: four hundred
125: one hundred and twenty five
380: three hundred and eighty
125: one hundred and twenty five
200: two hundred
250: two hundred and fifty
21: twenty one
1: one
450: four hundred and fifty
40: forty
15: fifteen
170: one hundred and seventy
15: fifteen
700: seven hundred
9: nine
200: two hundred
400: four hundred
75: seventy five
600: six hundred
300: three hundred
170: one hundred and seventy
10: ten
2014: two thousand and fourteen
000: zero
3: three
000: zero
30: thirty
1948: one thousand, nine hundred and forty eight
15: fifteen
50: fifty
20: twenty
18: eighteen
18: eighteen
32: thirty two
100: one hundred
1: one
500: five hundred
1338: one thousand, three hundred and thirty eight
12: twelve
24: twenty four
000: zero
10: ten
1545: one thousand, five hundred and forty five
25: twenty five
2008: two thousand and eight
500: five hundred
112: one hundred and twelve
16: sixteen
35: thirty five
500: five hundred
10: ten
4: four
700: seven hundred
17: seventeen
17: seventeen
2: two
8: eight
17: seventeen
1: one
100: one hundred
2014: two thousand and fourteen
515: five hundred and fifteen
210: two hundred and ten
125: one hundred and twenty five
550: five hundred and fifty
625: six hundred and twenty five
430: four hundred and thirty
150: one hundred and fifty
15: fifteen
75: seventy five
190: one hundred and ninety
133: one hundred and thirty three
295: two hundred and ninety five
37: thirty seven
000: zero
65: sixty five
425: four hundred and twenty five
147: one hundred and forty seven
8: eight
10: ten
62: sixty two
110: one hundred and ten
15: fifteen
20: twenty
20: twenty
21: twenty one
14: fourteen
3: three
25: twenty five
30: thirty
100: one hundred
110: one hundred and ten
800: eight hundred
200: two hundred
000: zero
20: twenty
1: one
400: four hundred
12: twelve
24: twenty four
6000: six thousand
135: one hundred and thirty five
20: twenty
30: thirty
50: fifty
15: fifteen
40: forty
000: zero
1905: one thousand, nine hundred and five
100: one hundred
3: three
100: one hundred
100: one hundred
50: fifty
2014: two thousand and fourteen
1984: one thousand, nine hundred and eighty four
2014: two thousand and fourteen
14: fourteen
87: eighty seven
3: three
20: twenty
65: sixty five
170: one hundred and seventy
11: eleven
2001: two thousand and one
96: ninety six
20: twenty
1945: one thousand, nine hundred and forty five
1965: one thousand, nine hundred and sixty five
1739: one thousand, seven hundred and thirty nine
28: twenty eight
161: one hundred and sixty one
35: thirty five
000: zero
14: fourteen
70: seventy
10: ten
20: twenty
40: forty
1606: one thousand, six hundred and six
10: ten
20: twenty
25: twenty five
300: three hundred
500: five hundred
200: two hundred
1996: one thousand, nine hundred and ninety six
95: ninety five
18: eighteen
59: fifty nine
23: twenty three
18: eighteen
12: twelve
21: twenty one
1976: one thousand, nine hundred and seventy six
25: twenty five
9: nine
458: four hundred and fifty eight
40: forty
11: eleven
15: fifteen
11: eleven
75: seventy five
15: fifteen
15: fifteen
20: twenty
1: one
10: ten
20: twenty
40: forty
85: eighty five
35: thirty five
5: five
1949: one thousand, nine hundred and forty nine
20: twenty
40: forty
11: eleven
70: seventy
30: thirty
000: zero
50: fifty
000: zero
65: sixty five
2016: two thousand and sixteen
28: twenty eight
100: one hundred
150: one hundred and fifty
2050: two thousand and fifty
100: one hundred
1963: one thousand, nine hundred and sixty three
20: twenty
80: eighty
100: one hundred
20: twenty
17: seventeen
000: zero
300: three hundred
46: forty six
700: seven hundred
1527: one thousand, five hundred and twenty seven
82: eighty two
60: sixty
18: eighteen
20: twenty
55: fifty five
1958: one thousand, nine hundred and fifty eight
16: sixteen
13: thirteen
12: twelve
000: zero
100: one hundred
14: fourteen
30: thirty
155: one hundred and fifty five
50: fifty
22: twenty two
000: zero
000: zero
40: forty
100: one hundred
150: one hundred and fifty
52: fifty two
11: eleven
24: twenty four
300: three hundred
400: four hundred
30: thirty
40: forty
30: thirty
27: twenty seven
46: forty six
300: three hundred
10: ten
4: four
7: seven
40: forty
5: five
5: five
80: eighty
80: eighty
150: one hundred and fifty
76: seventy six
20: twenty
2013: two thousand and thirteen
40: forty
20: twenty
930: nine hundred and thirty
1800: one thousand, eight hundred
680: six hundred and eighty
500: five hundred
80: eighty
90: ninety
95: ninety five
50: fifty
11: eleven
24: twenty four
300: three hundred
3: three
000: zero
89: eighty nine
20: twenty
40: forty
5: five
191: one hundred and ninety one
17: seventeen
1963: one thousand, nine hundred and sixty three
40: forty
60: sixty
40: forty
50: fifty
4: four
96: ninety six
0: zero
150: one hundred and fifty
180: one hundred and eighty
5: five
200: two hundred
300: three hundred
26: twenty six
11: eleven
15: fifteen
20: twenty
200: two hundred
120: one hundred and twenty
1: one
20: twenty
90: ninety
100: one hundred
200: two hundred
300: three hundred
100: one hundred
10: ten
20: twenty
82: eighty two
20: twenty
200: two hundred
100: one hundred
200: two hundred
100: one hundred
30: thirty
60: sixty
200: two hundred
200: two hundred
3: three
97: ninety seven
5: five
1: one
5: five
85: eighty five
40: forty
15: fifteen
20: twenty
2: two
4: four
2: two
500: five hundred
20: twenty
20: twenty
25: twenty five
30: thirty
27: twenty seven
000: zero
50: fifty
12: twelve
1: one
12: twelve
1: one
12: twelve
30: thirty
1: one
2: two
000: zero
520: five hundred and twenty
30: thirty
18: eighteen
18: eighteen
16: sixteen
18: eighteen
15: fifteen
20: twenty
7: seven
460: four hundred and sixty
1: one
30: thirty
3: three
2: two
12: twelve
2022: two thousand and twenty two
63: sixty three
3: three
15: fifteen
10: ten
2010: two thousand and ten
1923: one thousand, nine hundred and twenty three
11: eleven
25: twenty five
21: twenty one
100: one hundred
140: one hundred and forty
150: one hundred and fifty
200: two hundred
90: ninety
75: seventy five
20: twenty
16: sixteen
1723: one thousand, seven hundred and twenty three
150: one hundred and fifty
100: one hundred
200: two hundred
50: fifty
100: one hundred
90: ninety
200: two hundred
300: three hundred
30: thirty
24: twenty four
5: five
000: zero
000: zero
1: one
70: seventy
2: two
84: eighty four
1988: one thousand, nine hundred and eighty eight
15: fifteen
150: one hundred and fifty
29: twenty nine
38: thirty eight
1991: one thousand, nine hundred and ninety one
40: forty
55: fifty five
15: fifteen
20: twenty
47: forty seven
28: twenty eight
30: thirty
25: twenty five
98: ninety eight
230: two hundred and thirty
11: eleven
30: thirty
28: twenty eight
1605: one thousand, six hundred and five
40: forty
16: sixteen
80: eighty
120: one hundred and twenty
8: eight
60: sixty
20: twenty
30: thirty
1989: one thousand, nine hundred and eighty nine
20: twenty
18: eighteen
1987: one thousand, nine hundred and eighty seven
1926: one thousand, nine hundred and twenty six
13: thirteen
14: fourteen
250: two hundred and fifty
150: one hundred and fifty
400: four hundred
500: five hundred
120: one hundred and twenty
40: forty
40: forty
7: seven
9: nine
12: twelve
30: thirty
90: ninety
15: fifteen
150: one hundred and fifty
100: one hundred
180: one hundred and eighty
000: zero
175: one hundred and seventy five
150: one hundred and fifty
1828: one thousand, eight hundred and twenty eight
70: seventy
56: fifty six
10: ten
15: fifteen
20: twenty
130: one hundred and thirty
15: fifteen
20: twenty
12: twelve
330: three hundred and thirty
400: four hundred
100: one hundred
100: one hundred
100: one hundred
1911: one thousand, nine hundred and eleven
10: ten
340: three hundred and forty
75: seventy five
60: sixty
1828: one thousand, eight hundred and twenty eight
70: seventy
14: fourteen
15: fifteen
1651: one thousand, six hundred and fifty one
95: ninety five
20: twenty
50: fifty
60: sixty
520: five hundred and twenty
750: seven hundred and fifty
1948: one thousand, nine hundred and forty eight
800: eight hundred
20: twenty
25: twenty five
25: twenty five
60: sixty
500: five hundred
000: zero
58: fifty eight
20: twenty
750: seven hundred and fifty
90: ninety
10: ten
15: fifteen
1: one
12: twelve
000: zero
000: zero
20: twenty
24: twenty four
34: thirty four
729: seven hundred and twenty nine
20: twenty
10: ten
100: one hundred
130: one hundred and thirty
26: twenty six
000: zero
800: eight hundred
37: thirty seven
34: thirty four
90: ninety
15: fifteen
465: four hundred and sixty five
25: twenty five
000: zero
800: eight hundred
15: fifteen
3: three
50: fifty
2: two
18: eighteen
400: four hundred
415: four hundred and fifteen
30: thirty
19: nineteen
23: twenty three
400: four hundred
1706: one thousand, seven hundred and six
16: sixteen
20: twenty
200: two hundred
21: twenty one
500: five hundred
125: one hundred and twenty five
45: forty five
28: twenty eight
29: twenty nine
54: fifty four
16: sixteen
6: six
60: sixty
20: twenty
67: sixty seven
22: twenty two
60: sixty
000: zero
15: fifteen
200: two hundred
5: five
600: six hundred
15: fifteen
1933: one thousand, nine hundred and thirty three
10: ten
40: forty
90: ninety
000: zero
100: one hundred
000: zero
55: fifty five
400: four hundred
30: thirty
000: zero
246: two hundred and forty six
68: sixty eight
30: thirty
135: one hundred and thirty five
160: one hundred and sixty
175: one hundred and seventy five
000: zero
000: zero
2006: two thousand and six
57: fifty seven
185: one hundred and eighty five
125: one hundred and twenty five
205: two hundred and five
950: nine hundred and fifty
60: sixty
125: one hundred and twenty five
450: four hundred and fifty
385: three hundred and eighty five
5: five
7: seven
3: three
31: thirty one
80: eighty
69: sixty nine
8: eight
000: zero
350: three hundred and fifty
400: four hundred
550: five hundred and fifty
600: six hundred
650: six hundred and fifty
000: zero
000: zero
165: one hundred and sixty five
15: fifteen
90: ninety
20: twenty
10: ten
85: eighty five
100: one hundred
2: two
120: one hundred and twenty
13: thirteen
260: two hundred and sixty
27: twenty seven
61: sixty one
300: three hundred
165: one hundred and sixty five
175: one hundred and seventy five
50: fifty
225: two hundred and twenty five
35: thirty five
35: thirty five
42: forty two
500: five hundred
70: seventy
700: seven hundred
245: two hundred and forty five
90: ninety
120: one hundred and twenty
200: two hundred
65: sixty five
50: fifty
2: two
60: sixty
8: eight
425: four hundred and twenty five
225: two hundred and twenty five
1: one
200: two hundred
99: ninety nine
50: fifty
000: zero
400: four hundred
500: five hundred
100: one hundred
300: three hundred
500: five hundred
99: ninety nine
3: three
1: one
35: thirty five
270: two hundred and seventy
32: thirty two
32: thirty two
30: thirty
20: twenty
130: one hundred and thirty
300: three hundred
23: twenty three
12: twelve
22: twenty two
20: twenty
22: twenty two
29: twenty nine
000: zero
60: sixty
10: ten
15: fifteen
239: two hundred and thirty nine
14: fourteen
250: two hundred and fifty
16: sixteen
350: three hundred and fifty
370: three hundred and seventy
350: three hundred and fifty
000: zero
99: ninety nine
53: fifty three
5: five
75: seventy five
40: forty
40: forty
30: thirty
2017: two thousand and seventeen
2: two
16: sixteen
18: eighteen
5: five
10: ten
20: twenty
6: six
15: fifteen
20: twenty
000: zero
1918: one thousand, nine hundred and eighteen
1923: one thousand, nine hundred and twenty three
53: fifty three
55: fifty five
130: one hundred and thirty
12: twelve
69: sixty nine
160: one hundred and sixty
36: thirty six
200: two hundred
100: one hundred
150: one hundred and fifty
95: ninety five
9: nine
155: one hundred and fifty five
10: ten
25: twenty five
30: thirty
235: two hundred and thirty five
125: one hundred and twenty five
110: one hundred and ten
1981: one thousand, nine hundred and eighty one
12: twelve
50: fifty
10: ten
1: one
1967: one thousand, nine hundred and sixty seven
90: ninety
400: four hundred
40: forty
4: four
000: zero
50: fifty
30: thirty
100: one hundred
25: twenty five
8: eight
1745: one thousand, seven hundred and forty five
000: zero
8: eight
5: five
2016: two thousand and sixteen
10: ten
1: one
15: fifteen
80: eighty
35: thirty five
20: twenty
100: one hundred
50: fifty
40: forty
10: ten
10: ten
620: six hundred and twenty
12: twelve
350: three hundred and fifty
500: five hundred
14: fourteen
10: ten
11: eleven
11: eleven
10: ten
647: six hundred and forty seven
101: one hundred and one
30: thirty
200: two hundred
200: two hundred
300: three hundred
140: one hundred and forty
1918: one thousand, nine hundred and eighteen
1961: one thousand, nine hundred and sixty one
1940: one thousand, nine hundred and forty
18: eighteen
250: two hundred and fifty
165: one hundred and sixty five
85: eighty five
42: forty two
820: eight hundred and twenty
44: forty four
250: two hundred and fifty
1759: one thousand, seven hundred and fifty nine
100: one hundred
15: fifteen
60: sixty
20: twenty
30: thirty
23: twenty three
13: thirteen
7: seven
000: zero
000: zero
9000: nine thousand
966: nine hundred and sixty six
100: one hundred
55: fifty five
15: fifteen
500: five hundred
14: fourteen
65: sixty five
96: ninety six
5: five
20: twenty
35: thirty five
200: two hundred
30: thirty
1940: one thousand, nine hundred and forty
15: fifteen
18: eighteen
5: five
30: thirty
20: twenty
100: one hundred
14: fourteen
95: ninety five
4: four
2: two
1960: one thousand, nine hundred and sixty
1963: one thousand, nine hundred and sixty three
57: fifty seven
900: nine hundred
6: six
90: ninety
40: forty
000: zero
2: two
30: thirty
000: zero
2009: two thousand and nine
10: ten
10: ten
40: forty
60: sixty
25: twenty five
35: thirty five
78: seventy eight
1826: one thousand, eight hundred and twenty six
12: twelve
6: six
25: twenty five
27: twenty seven
1: one
300: three hundred
400: four hundred
100: one hundred
60: sixty
425: four hundred and twenty five
000: zero
10: ten
13: thirteen
425: four hundred and twenty five
6: six
100: one hundred
70: seventy
6: six
100: one hundred
17: seventeen
100: one hundred
120: one hundred and twenty
50: fifty
5: five
12: twelve
75: seventy five
10: ten
209: two hundred and nine
50: fifty
260: two hundred and sixty
260: two hundred and sixty
160: one hundred and sixty
14: fourteen
50: fifty
26: twenty six
18: eighteen
8: eight
70: seventy
20: twenty
500: five hundred
1: one
3: three
12: twelve
1878: one thousand, eight hundred and seventy eight
30: thirty
40: forty
6: six
2015: two thousand and fifteen
7: seven
12: twelve
1977: one thousand, nine hundred and seventy seven
30: thirty
1954: one thousand, nine hundred and fifty four
20: twenty
12: twelve
2015: two thousand and fifteen
2: two
54: fifty four
10: ten
24: twenty four
300: three hundred
218: two hundred and eighteen
35: thirty five
1951: one thousand, nine hundred and fifty one
20: twenty
90: ninety
90: ninety
15: fifteen
854: eight hundred and fifty four
1985: one thousand, nine hundred and eighty five
20: twenty
000: zero
30: thirty
3: three
3: three
5: five
49: forty nine
350: three hundred and fifty
100: one hundred
200: two hundred
105: one hundred and five
12: twelve
1: one
5: five
400: four hundred
5: five
1: one
2: two
000: zero
2011: two thousand and eleven
1911: one thousand, nine hundred and eleven
1967: one thousand, nine hundred and sixty seven
15: fifteen
11: eleven
2016: two thousand and sixteen
646: six hundred and forty six
2: two
2: two
1973: one thousand, nine hundred and seventy three
65: sixty five
100: one hundred
150: one hundred and fifty
600: six hundred
400: four hundred
500: five hundred
1994: one thousand, nine hundred and ninety four
17: seventeen
30: thirty
15: fifteen
200: two hundred
300: three hundred
15: fifteen
2016: two thousand and sixteen
50: fifty
50: fifty
100: one hundred
520: five hundred and twenty
150: one hundred and fifty
2300: two thousand, three hundred
24: twenty four
15: fifteen
40: forty
10: ten
200: two hundred
1: one
700: seven hundred
200: two hundred
18: eighteen
15: fifteen
20: twenty
14: fourteen
69: sixty nine
3: three
200: two hundred
25: twenty five
930: nine hundred and thirty
10: ten
2016: two thousand and sixteen
2016: two thousand and sixteen
2016: two thousand and sixteen
180: one hundred and eighty
13: thirteen
7: seven
1852: one thousand, eight hundred and fifty two
1: one
35: thirty five
150: one hundred and fifty
42: forty two
2: two
100: one hundred
70: seventy
100: one hundred
20: twenty
30: thirty
4: four
60: sixty
100: one hundred
100: one hundred
700: seven hundred
20: twenty
000: zero
70: seventy
1: one
20: twenty
65: sixty five
70: seventy
20: twenty
20: twenty
100: one hundred
000: zero
30: thirty
110: one hundred and ten
22: twenty two
24: twenty four
120: one hundred and twenty
10: ten
12: twelve
14: fourteen
5: five
31: thirty one
12: twelve
7: seven
9: nine
2: two
74: seventy four
1: one
2009: two thousand and nine
40: forty
1969: one thousand, nine hundred and sixty nine
1998: one thousand, nine hundred and ninety eight
14: fourteen
1973: one thousand, nine hundred and seventy three
000: zero
100: one hundred
25: twenty five
20: twenty
48: forty eight
17: seventeen
24: twenty four
4: four
000: zero
2: two
55: fifty five
4: four
24: twenty four
28: twenty eight
27: twenty seven
27: twenty seven
30: thirty
30: thirty
12: twelve
600: six hundred
500: five hundred
1835: one thousand, eight hundred and thirty five
22: twenty two
20: twenty
45: forty five
60: sixty
89: eighty nine
40: forty
0: zero
12: twelve
10: ten
1826: one thousand, eight hundred and twenty six
000: zero
5: five
1848: one thousand, eight hundred and forty eight
14: fourteen
16: sixteen
11: eleven
800: eight hundred
300: three hundred
000: zero
100: one hundred
600: six hundred
125: one hundred and twenty five
20: twenty
85: eighty five
40: forty
8: eight
18: eighteen
73: seventy three
40: forty
2: two
1948: one thousand, nine hundred and forty eight
80: eighty
500: five hundred
200: two hundred
400: four hundred
1948: one thousand, nine hundred and forty eight
20: twenty
1790: one thousand, seven hundred and ninety
1984: one thousand, nine hundred and eighty four
2: two
000: zero
400: four hundred
150: one hundred and fifty
15: fifteen
25: twenty five
000: zero
1: one
000: zero
47: forty seven
22: twenty two
5: five
40: forty
100: one hundred
40: forty
10: ten
2: two
1: one
49: forty nine
100: one hundred
350: three hundred and fifty
155: one hundred and fifty five
90: ninety
95: ninety five
000: zero
325: three hundred and twenty five
320: three hundred and twenty
325: three hundred and twenty five
10: ten
20: twenty
15: fifteen
30: thirty
80: eighty
60: sixty
15: fifteen
1: one
10: ten
20: twenty
2: two
1848: one thousand, eight hundred and forty eight
1842: one thousand, eight hundred and forty two
90: ninety
25: twenty five
30: thirty
10: ten
In [ ]:
# Tokenize all sentences in the dataframe
all_tokens = [token for sentence in df['sentence'] for token in nltk.word_tokenize(sentence)]

# Count the frequency of each token
token_counts = Counter(all_tokens)

# Get the top 20 most frequent tokens
common_tokens = token_counts.most_common(20)

# Set a professional color palette and style
sns.set_style("ticks", {"xtick.major.size": 8, "ytick.major.size": 8})
sns.set_context("talk", font_scale=0.8)
color = '#2980B9'  # Slightly deeper shade of blue

# Plot
plt.figure(figsize=(13, 12))

# Plotting each bar with the refined color
tokens, frequencies = zip(*common_tokens)
for token, freq in zip(tokens, frequencies):
    plt.barh(token, freq, color=color, edgecolor='silver', height=0.7)
    plt.text(freq + 10, token, str(freq), va='center', color='black', fontsize=12)  # Annotate each bar with its count

# Refining title and axis labels for a polished look
plt.title('Top 20 Most Frequent Tokens', fontsize=20, fontweight='bold', pad=20)
plt.xlabel('Frequency', fontsize=16)
plt.ylabel('Tokens', fontsize=16)
plt.gca().invert_yaxis()  # To display the most frequent token at the top

# Introducing subtle gridlines for better mapping
plt.grid(axis='x', linestyle='--', alpha=0.6)

# Adjusting axis ticks for aesthetics
plt.xticks(fontsize=14)
plt.yticks(fontsize=14)

plt.tight_layout()
plt.show()
[Image: horizontal bar chart of the top 20 most frequent tokens]
In [ ]:
df.head()
Out[ ]:
sentence
0 WHEN YOU'RE COOKING CHIPS AT HOME
1 THE TRADITIONAL CHIP PAN OFTEN STAYS ON THE SHELF
2 THROUGH WHAT THEY CALL A KNIFE BLOCK
3 WHICH INVOLVES FIRING A POTATO DOWN A PIPE
4 APART FROM THE GOLDEN COLOUR AND THE DELICIOUS...
In [ ]:
for percentile in [25, 50, 75, 90, 95, 99]:
    print(f"{percentile}th percentile:", np.percentile(sentence_lengths, percentile))
unique_words = set(word for sentence in df['sentence'] for word in sentence.split())
print("Total unique words:", len(unique_words))
word_counts = Counter(word for sentence in df['sentence'] for word in sentence.split())
print("Most common words:", word_counts.most_common(10))
print("Least common words:", word_counts.most_common()[:-11:-1])
25th percentile: 5.0
50th percentile: 6.0
75th percentile: 9.0
90th percentile: 13.0
95th percentile: 16.0
99th percentile: 20.0
Total unique words: 17388
Most common words: [('THE', 16538), ('TO', 9609), ('A', 8610), ('AND', 8595), ('OF', 7332), ('I', 5829), ('IT', 5226), ('IN', 5052), ('THAT', 4827), ('YOU', 4757)]
Least common words: [('SEIZURES', 1), ('PERSUADERS', 1), ('BANKRUPTING', 1), ('REWROTE', 1), ('FLAWS', 1), ('RHINE', 1), ('BROCKEN', 1), ('CROWDED', 1), ("TROTSKY'S", 1), ('UNISON', 1)]
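A quick follow-up on that long tail: counting how many of the unique words occur exactly once, using the word_counts Counter built in the cell above (a minimal sketch):
In [ ]:
# Count hapax legomena (words that appear exactly once) from the Counter above
hapaxes = [w for w, c in word_counts.items() if c == 1]
print(f"Words occurring exactly once: {len(hapaxes)} of {len(word_counts)} unique words")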
In [ ]:
# Define a pattern for common contractions
common_contractions_pattern = r"\b(?:[a-zA-Z]+n't|[a-zA-Z]+'ll|[a-zA-Z]+'ve|[a-zA-Z]+'re|[a-zA-Z]+'d|[a-zA-Z]+'s)\b"

# Find common contractions in each line and store them
contractions_counter = Counter()
for line in df['sentence']:
    contractions_counter.update(re.findall(common_contractions_pattern, line.lower()))

# Get the most common contractions and their counts
most_common_contractions = contractions_counter.most_common()

# Calculate total contractions found
total_contractions = sum(contractions_counter.values())

most_common_contractions, total_contractions
Out[ ]:
([("it's", 2445),
  ("that's", 1015),
  ("don't", 978),
  ("you're", 522),
  ("i've", 494),
  ("we've", 492),
  ("there's", 422),
  ("we're", 416),
  ("they're", 391),
  ("let's", 358),
  ("you've", 345),
  ("can't", 313),
  ("he's", 312),
  ("didn't", 258),
  ("i'll", 201),
  ("i'd", 187),
  ("she's", 186),
  ("what's", 183),
  ("wasn't", 177),
  ("doesn't", 163),
  ("they've", 157),
  ("we'll", 155),
  ("wouldn't", 122),
  ("haven't", 100),
  ("won't", 98),
  ("you'll", 89),
  ("couldn't", 85),
  ("isn't", 85),
  ("today's", 80),
  ("you'd", 76),
  ("they'll", 68),
  ("we'd", 68),
  ("he'd", 53),
  ("weren't", 49),
  ("aren't", 41),
  ("they'd", 41),
  ("who's", 41),
  ("it'll", 38),
  ("here's", 35),
  ("hadn't", 30),
  ("year's", 27),
  ("britain's", 26),
  ("tonight's", 26),
  ("world's", 25),
  ("people's", 23),
  ("shouldn't", 22),
  ("everyone's", 21),
  ("hasn't", 20),
  ("he'll", 19),
  ("everybody's", 18),
  ("would've", 18),
  ("she'd", 15),
  ("life's", 12),
  ("mother's", 11),
  ("children's", 11),
  ("father's", 11),
  ("week's", 11),
  ("who've", 10),
  ("someone's", 9),
  ("wife's", 9),
  ("women's", 9),
  ("ain't", 9),
  ("man's", 9),
  ("nation's", 9),
  ("bbc's", 8),
  ("it'd", 8),
  ("she'll", 8),
  ("one's", 7),
  ("name's", 7),
  ("weekend's", 7),
  ("how's", 7),
  ("dad's", 7),
  ("night's", 7),
  ("that'll", 7),
  ("london's", 6),
  ("king's", 6),
  ("mum's", 6),
  ("where's", 6),
  ("time's", 6),
  ("matt's", 5),
  ("thing's", 5),
  ("market's", 5),
  ("weather's", 5),
  ("everything's", 5),
  ("there'll", 5),
  ("paul's", 5),
  ("bradshaw's", 5),
  ("queen's", 5),
  ("daren't", 4),
  ("europe's", 4),
  ("boy's", 4),
  ("country's", 4),
  ("nature's", 4),
  ("else's", 4),
  ("england's", 4),
  ("men's", 4),
  ("tv's", 4),
  ("team's", 4),
  ("something's", 4),
  ("somebody's", 4),
  ("work's", 4),
  ("phil's", 4),
  ("webster's", 4),
  ("shakespeare's", 4),
  ("peter's", 4),
  ("month's", 3),
  ("other's", 3),
  ("anything's", 3),
  ("dave's", 3),
  ("town's", 3),
  ("city's", 3),
  ("god's", 3),
  ("who'd", 3),
  ("woman's", 3),
  ("uk's", 3),
  ("kate's", 3),
  ("henry's", 3),
  ("island's", 3),
  ("county's", 3),
  ("girl's", 3),
  ("day's", 3),
  ("charlie's", 3),
  ("nobody's", 3),
  ("david's", 3),
  ("bid's", 3),
  ("grandmother's", 3),
  ("gentleman's", 3),
  ("tom's", 3),
  ("tomorrow's", 3),
  ("harm's", 3),
  ("edward's", 3),
  ("hogarth's", 3),
  ("mustn't", 3),
  ("brother's", 3),
  ("family's", 3),
  ("sun's", 2),
  ("soldier's", 2),
  ("should've", 2),
  ("son's", 2),
  ("show's", 2),
  ("christ's", 2),
  ("lawrence's", 2),
  ("money's", 2),
  ("planet's", 2),
  ("thomas's", 2),
  ("person's", 2),
  ("company's", 2),
  ("majesty's", 2),
  ("individual's", 2),
  ("buyer's", 2),
  ("mistress's", 2),
  ("george's", 2),
  ("pam's", 2),
  ("labour's", 2),
  ("club's", 2),
  ("miranda's", 2),
  ("centurion's", 2),
  ("john's", 2),
  ("gourmet's", 2),
  ("shan't", 2),
  ("november's", 2),
  ("spencer's", 2),
  ("jack's", 2),
  ("farming's", 2),
  ("maker's", 2),
  ("jesus's", 2),
  ("brand's", 2),
  ("rhod's", 2),
  ("mark's", 2),
  ("there'd", 2),
  ("when's", 2),
  ("valentine's", 2),
  ("whatever's", 2),
  ("busman's", 2),
  ("relief's", 2),
  ("item's", 2),
  ("oak's", 2),
  ("lee's", 2),
  ("georgie's", 2),
  ("summer's", 2),
  ("shepherd's", 2),
  ("nash's", 2),
  ("animal's", 2),
  ("alzheimer's", 2),
  ("doctor's", 2),
  ("husband's", 2),
  ("bobby's", 2),
  ("america's", 2),
  ("cathedral's", 2),
  ("gentlemen's", 2),
  ("tim's", 2),
  ("could've", 2),
  ("daddy's", 2),
  ("mick's", 2),
  ("emma's", 2),
  ("yesterday's", 2),
  ("television's", 2),
  ("anybody's", 2),
  ("agency's", 2),
  ("roscoff's", 2),
  ("paula's", 2),
  ("lady's", 2),
  ("saleroom's", 2),
  ("pete's", 2),
  ("goat's", 2),
  ("gully's", 1),
  ("sheep's", 1),
  ("later's", 1),
  ("barr's", 1),
  ("gaynor's", 1),
  ("bar's", 1),
  ("church's", 1),
  ("rachel's", 1),
  ("age's", 1),
  ("galileo's", 1),
  ("jennifer's", 1),
  ("kathy's", 1),
  ("titchmarsh's", 1),
  ("century's", 1),
  ("conqueror's", 1),
  ("dermot's", 1),
  ("damien's", 1),
  ("bohemond's", 1),
  ("marconi's", 1),
  ("annie's", 1),
  ("richard's", 1),
  ("topography's", 1),
  ("owner's", 1),
  ("chief's", 1),
  ("handler's", 1),
  ("hunt's", 1),
  ("government's", 1),
  ("riding's", 1),
  ("nhs'll", 1),
  ("katy's", 1),
  ("sotheby's", 1),
  ("eyre's", 1),
  ("cromwell's", 1),
  ("spix's", 1),
  ("nic's", 1),
  ("dealer's", 1),
  ("parent's", 1),
  ("frank's", 1),
  ("legion's", 1),
  ("derbyshire's", 1),
  ("cassini's", 1),
  ("newborn's", 1),
  ("garrow's", 1),
  ("clive's", 1),
  ("neck's", 1),
  ("edmund's", 1),
  ("channel's", 1),
  ("cartland's", 1),
  ("howard's", 1),
  ("bpa's", 1),
  ("wren's", 1),
  ("eamonn's", 1),
  ("daimler's", 1),
  ("juana's", 1),
  ("barrow's", 1),
  ("holly's", 1),
  ("sue's", 1),
  ("flavour's", 1),
  ("so's", 1),
  ("martin's", 1),
  ("hancock's", 1),
  ("smith's", 1),
  ("mankind's", 1),
  ("value's", 1),
  ("phone's", 1),
  ("eric's", 1),
  ("gillian's", 1),
  ("author's", 1),
  ("victoria's", 1),
  ("pamela's", 1),
  ("hour's", 1),
  ("grandfather's", 1),
  ("wheatley's", 1),
  ("jackie's", 1),
  ("malta's", 1),
  ("gormley's", 1),
  ("deer's", 1),
  ("rate's", 1),
  ("dunbar's", 1),
  ("anyone's", 1),
  ("sande's", 1),
  ("principle's", 1),
  ("gordon's", 1),
  ("julia's", 1),
  ("think's", 1),
  ("margaret's", 1),
  ("gabby's", 1),
  ("ronnie's", 1),
  ("baxter's", 1),
  ("canopy's", 1),
  ("bird's", 1),
  ("minton's", 1),
  ("alexandra's", 1),
  ("clerk's", 1),
  ("tb's", 1),
  ("chemist's", 1),
  ("fermi's", 1),
  ("jeanette's", 1),
  ("macmillan's", 1),
  ("drake's", 1),
  ("bottom's", 1),
  ("watkins's", 1),
  ("peterborough's", 1),
  ("linda's", 1),
  ("churchill's", 1),
  ("band's", 1),
  ("liverpool's", 1),
  ("bretby's", 1),
  ("auction's", 1),
  ("kitchener's", 1),
  ("blacksmith's", 1),
  ("constantine's", 1),
  ("justinian's", 1),
  ("orwell's", 1),
  ("roadshow's", 1),
  ("emperor's", 1),
  ("b's", 1),
  ("boudicca's", 1),
  ("part's", 1),
  ("alan's", 1),
  ("mortimer's", 1),
  ("commander's", 1),
  ("this'll", 1),
  ("daphne's", 1),
  ("chris's", 1),
  ("vicar's", 1),
  ("teddy's", 1),
  ("rome's", 1),
  ("devon's", 1),
  ("clayton's", 1),
  ("adam's", 1),
  ("nottingham's", 1),
  ("hollywood's", 1),
  ("andrew's", 1),
  ("denny's", 1),
  ("derby's", 1),
  ("that'd", 1),
  ("director's", 1),
  ("driver's", 1),
  ("ship's", 1),
  ("pop's", 1),
  ("sullivan's", 1),
  ("jamie's", 1),
  ("betty's", 1),
  ("dad'll", 1),
  ("lalique's", 1),
  ("laura's", 1),
  ("suzanne's", 1),
  ("jaguar's", 1),
  ("kat's", 1),
  ("kerr's", 1),
  ("tennyson's", 1),
  ("past's", 1),
  ("peacock's", 1),
  ("cow's", 1),
  ("parson's", 1),
  ("caroline's", 1),
  ("fire's", 1),
  ("friend's", 1),
  ("salesmen's", 1),
  ("darren's", 1),
  ("original's", 1),
  ("bernice's", 1),
  ("empire's", 1),
  ("marie's", 1),
  ("saul's", 1),
  ("canine's", 1),
  ("charlotte's", 1),
  ("farm's", 1),
  ("giant's", 1),
  ("damian's", 1),
  ("foxe's", 1),
  ("barbara's", 1),
  ("builder's", 1),
  ("edith's", 1),
  ("decision's", 1),
  ("ve'll", 1),
  ("hamish's", 1),
  ("tree's", 1),
  ("mcclintock's", 1),
  ("prince's", 1),
  ("cheque's", 1),
  ("australia's", 1),
  ("music's", 1),
  ("russell's", 1),
  ("hairdresser's", 1),
  ("lucy's", 1),
  ("cadbury's", 1),
  ("water's", 1),
  ("devil's", 1),
  ("venue's", 1),
  ("artist's", 1),
  ("beard's", 1),
  ("germany's", 1),
  ("juliet's", 1),
  ("player's", 1),
  ("torrin's", 1),
  ("hackman's", 1),
  ("photographer's", 1),
  ("madeira's", 1),
  ("monk's", 1),
  ("trinian's", 1),
  ("pont's", 1),
  ("tyler's", 1),
  ("love's", 1),
  ("naani's", 1),
  ("heston's", 1),
  ("mayor's", 1),
  ("scotland's", 1),
  ("chain's", 1),
  ("philip's", 1),
  ("tripper's", 1),
  ("len's", 1),
  ("building's", 1),
  ("byron's", 1),
  ("gear's", 1),
  ("limestone's", 1),
  ("mary's", 1),
  ("asprey's", 1),
  ("workmen's", 1),
  ("snake's", 1),
  ("washington's", 1),
  ("astley's", 1),
  ("smart's", 1),
  ("oakey's", 1),
  ("castle's", 1),
  ("miner's", 1),
  ("kent's", 1),
  ("story's", 1),
  ("mexico's", 1),
  ("collector's", 1),
  ("pm's", 1),
  ("fiction's", 1),
  ("ballard's", 1),
  ("wilson's", 1),
  ("gaulle's", 1),
  ("sony's", 1),
  ("korea's", 1),
  ("auctioneer's", 1),
  ("jessica's", 1),
  ("donkey's", 1),
  ("audrey's", 1),
  ("rodney's", 1),
  ("sharon's", 1),
  ("car's", 1),
  ("relative's", 1),
  ("france's", 1),
  ("bloke's", 1),
  ("catherine's", 1),
  ("merchant's", 1),
  ("kathleen's", 1),
  ("calm's", 1),
  ("rspb's", 1),
  ("viii's", 1),
  ("glitter's", 1),
  ("hartley's", 1),
  ("debbie's", 1),
  ("aim's", 1),
  ("grandma's", 1),
  ("heart's", 1),
  ("bertie's", 1),
  ("saddle's", 1),
  ("firm's", 1),
  ("machine's", 1),
  ("manor's", 1),
  ("ted's", 1),
  ("sunderland's", 1),
  ("cabot's", 1),
  ("tot's", 1),
  ("belfort's", 1),
  ("fisherman's", 1),
  ("half's", 1),
  ("season's", 1),
  ("frost's", 1),
  ("client's", 1),
  ("corvette's", 1),
  ("people've", 1),
  ("publisher's", 1),
  ("cameron's", 1),
  ("where'd", 1),
  ("adrian's", 1),
  ("julie's", 1),
  ("eve's", 1),
  ("clarkson's", 1),
  ("payer's", 1),
  ("hammer's", 1),
  ("hepburn's", 1),
  ("peck's", 1),
  ("evil's", 1),
  ("sandy's", 1),
  ("clare's", 1),
  ("barry's", 1),
  ("hitler's", 1),
  ("leg's", 1),
  ("spock's", 1),
  ("poppy's", 1),
  ("cinema's", 1),
  ("lord's", 1),
  ("morsi's", 1),
  ("incedal's", 1),
  ("now's", 1),
  ("generation's", 1),
  ("community's", 1),
  ("why've", 1),
  ("ben's", 1),
  ("photo's", 1),
  ("grainger's", 1),
  ("evening's", 1),
  ("couple's", 1),
  ("grace's", 1),
  ("store's", 1),
  ("brahms's", 1),
  ("fox's", 1),
  ("wellington's", 1),
  ("forum's", 1),
  ("property's", 1),
  ("bathroom's", 1),
  ("sunday's", 1),
  ("bill's", 1),
  ("crew's", 1),
  ("who'll", 1),
  ("teacher's", 1),
  ("justin's", 1),
  ("there've", 1),
  ("roman's", 1),
  ("dante's", 1),
  ("sailor's", 1),
  ("eva's", 1),
  ("monica's", 1),
  ("jade's", 1),
  ("mar's", 1),
  ("moorcroft's", 1),
  ("jay's", 1),
  ("military's", 1),
  ("hitchhiker's", 1),
  ("pilot's", 1),
  ("duxford's", 1),
  ("veteran's", 1),
  ("ireland's", 1),
  ("tea's", 1),
  ("graham's", 1),
  ("shazia's", 1),
  ("helen's", 1),
  ("bishop's", 1),
  ("beeching's", 1),
  ("might've", 1),
  ("jenny's", 1),
  ("jonathan's", 1),
  ("monday's", 1),
  ("control's", 1),
  ("adele's", 1),
  ("parkinson's", 1),
  ("stephen's", 1),
  ("savile's", 1),
  ("gilding's", 1),
  ("owen's", 1),
  ("professor's", 1),
  ("olympian's", 1),
  ("hodgkin's", 1),
  ("trump's", 1),
  ("eleanor's", 1),
  ("craig's", 1),
  ("alia's", 1),
  ("ram's", 1),
  ("college's", 1),
  ("harrison's", 1),
  ("pat's", 1),
  ("sister's", 1),
  ("practice's", 1),
  ("madonna's", 1),
  ("january's", 1),
  ("museum's", 1),
  ("madge's", 1),
  ("rene's", 1),
  ("reader's", 1),
  ("brian's", 1),
  ("flossy's", 1),
  ("countryfile's", 1),
  ("kevin's", 1),
  ("hubble's", 1),
  ("bang's", 1),
  ("alexander's", 1),
  ("aleksandr's", 1),
  ("moscow's", 1),
  ("harold's", 1),
  ("arctic's", 1),
  ("technology's", 1),
  ("patient's", 1),
  ("cbbc's", 1),
  ("charity's", 1),
  ("dude's", 1),
  ("janet's", 1),
  ("hand's", 1),
  ("dot's", 1),
  ("economy's", 1),
  ("william's", 1),
  ("sian's", 1),
  ("braxton's", 1),
  ("weston's", 1),
  ("tumour's", 1),
  ("gina's", 1),
  ("candidate's", 1),
  ("must've", 1),
  ("madeline's", 1),
  ("diamond's", 1),
  ("hammock's", 1),
  ("polo's", 1),
  ("humanity's", 1),
  ("maxwell's", 1),
  ("university's", 1),
  ("whoever's", 1),
  ("gregg's", 1),
  ("trotsky's", 1)],
 12608)
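Before these forms are expanded later in the pipeline, a quick look at how the contractions library rewrites a few of the most frequent ones (a minimal sketch; the expansions are simply whatever contractions.fix returns):
In [ ]:
import contractions

# Spot-check expansion of the most frequent contractions found above
for form in ["it's", "that's", "don't", "you're", "i've"]:
    print(form, "->", contractions.fix(form))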
In [ ]:
# Calculate word count for sentences
df['word_count'] = df['sentence'].apply(lambda x: len(x.split()))

# Print statistics on word counts
print(df['word_count'].describe())

# Visualization: Histograms of sentence word counts
plt.hist(df['word_count'], bins=20, alpha=0.7)
plt.title('Word Counts in Sentences')
plt.xlabel('Word Count')
plt.ylabel('Frequency')
plt.show()
count    45839.000000
mean         7.231702
std          3.770229
min          3.000000
25%          4.000000
50%          6.000000
75%          9.000000
max         28.000000
Name: word_count, dtype: float64
[Image: histogram of sentence word counts]
In [ ]:
# Create inflect engine once
p = inflect.engine()

def convert_numerical_ordinals_to_words(text):
    words = text.split()
    for i, word in enumerate(words):
        # Removing punctuation for better matching
        clean_word = word.rstrip(string.punctuation)
        if re.match(r'\d+(st|nd|rd|th)', clean_word):
            number = re.match(r'\d+', clean_word).group()
            word_ordinal = p.number_to_words(int(number), ordinal=True, andword=' ', zero='zero', one='one')
            # Retain the punctuation after conversion
            punctuation = word[len(clean_word):]
            word_ordinal += punctuation
            words[i] = word_ordinal
    return ' '.join(words)
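For reference, inflect can also spell ordinals by composing its documented ordinal() and number_to_words() calls; a minimal standalone check, separate from the helper above (its formatting may differ slightly from what the helper produces):
In [ ]:
import inflect

p_check = inflect.engine()
# Spell a few numerical ordinals as words via the documented composition
for n in [1, 3, 22, 101]:
    print(n, "->", p_check.number_to_words(p_check.ordinal(n)))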
In [ ]:
# Convert any numerical ordinals in the sentences to their word form
df['sentence'] = df['sentence'].apply(convert_numerical_ordinals_to_words)

# Display the first few rows to verify the changes
print(df.head())
                                            sentence  word_count
0                  WHEN YOU'RE COOKING CHIPS AT HOME           6
1  THE TRADITIONAL CHIP PAN OFTEN STAYS ON THE SHELF           9
2               THROUGH WHAT THEY CALL A KNIFE BLOCK           7
3         WHICH INVOLVES FIRING A POTATO DOWN A PIPE           8
4  APART FROM THE GOLDEN COLOUR AND THE DELICIOUS...           9
In [ ]:
df_original = df.copy(deep=True)
In [ ]:
# Backup the sentences before conversion
df['original_sentence'] = df['sentence'].copy()

# Display a few randomly selected original and converted sentences for comparison
sample_sentences = df.sample(10)
for index, row in sample_sentences.iterrows():
    print(f"Original: {row['original_sentence']}")
    print(f"Converted: {row['sentence']}")
    print("------")

# Compute statistics
df['word_count_after_conversion'] = df['sentence'].apply(lambda x: len(x.split()))
print("\nStatistics after conversion:")
print(df['word_count_after_conversion'].describe())

# Visualization: Histograms of sentence lengths after conversion
plt.hist(df['word_count_after_conversion'], bins=20, alpha=0.7, color='blue', label='After Conversion')
plt.hist(df['word_count'], bins=20, alpha=0.7, color='red', label='Before Conversion')
plt.title('Sentence Lengths Comparison')
plt.xlabel('Length (words)')
plt.ylabel('Frequency')
plt.legend()
plt.show()
Original: CAN WE FORGET ABOUT THE PRICE TAG
Converted: CAN WE FORGET ABOUT THE PRICE TAG
------
Original: BUT IT IS A PLEASURE TO SIT THERE AND SEE WHAT TURNS UP
Converted: BUT IT IS A PLEASURE TO SIT THERE AND SEE WHAT TURNS UP
------
Original: MY REAL NAME IS BASIL DEVERE COURTNEY
Converted: MY REAL NAME IS BASIL DEVERE COURTNEY
------
Original: SO FOR EVERY one hundred
Converted: SO FOR EVERY one hundred
------
Original: THEY'RE NOT SECOND HAND OR THIRD HAND
Converted: THEY'RE NOT SECOND HAND OR THIRD HAND
------
Original: THERE IS A RARITY FACTOR
Converted: THERE IS A RARITY FACTOR
------
Original: WHY DON'T WE HAVE A LOOK AT HOW POOR COLIN YOUNG IS GETTING ON WITH THE BLUE TEAM'S BONUS
Converted: WHY DON'T WE HAVE A LOOK AT HOW POOR COLIN YOUNG IS GETTING ON WITH THE BLUE TEAM'S BONUS
------
Original: THE GAME GREW IN POPULARITY
Converted: THE GAME GREW IN POPULARITY
------
Original: AS SOON AS THE DATE WAS ANNOUNCED
Converted: AS SOON AS THE DATE WAS ANNOUNCED
------
Original: I'D SAY fifty TO eighty
Converted: I'D SAY fifty TO eighty
------

Statistics after conversion:
count    45839.000000
mean         7.231702
std          3.770229
min          3.000000
25%          4.000000
50%          6.000000
75%          9.000000
max         28.000000
Name: word_count_after_conversion, dtype: float64
[Image: overlaid histograms of sentence lengths before and after conversion]
In [ ]:
# Create a copy of the dataframe
# Note: this copy is taken after the ordinal conversion has already been applied,
# so the comparison below can only ever report zero changes; a snapshot taken
# before the conversion is needed to measure its effect (see the sketch below).
df_copy = df.copy()

# Compare and create a 'changed' column
df['changed'] = df['sentence'] != df_copy['sentence']

# Obtain statistics
changed_count = df['changed'].sum()
unchanged_count = len(df) - changed_count

print(f"Number of sentences that changed: {changed_count}")
print(f"Number of sentences that remained unchanged: {unchanged_count}")
Number of sentences that changed: 0
Number of sentences that remained unchanged: 45839
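To measure what the ordinal conversion actually changes, the snapshot has to be taken before the conversion is applied; a minimal sketch of that workflow (df_snapshot is a hypothetical name, and convert_numerical_ordinals_to_words is the helper defined earlier):
In [ ]:
# Hypothetical workflow: snapshot first, then convert, then compare
df_snapshot = df.copy()
df['sentence'] = df['sentence'].apply(convert_numerical_ordinals_to_words)

changed_mask = df['sentence'] != df_snapshot['sentence']
print(f"Sentences changed by the conversion: {changed_mask.sum()}")
print(f"Sentences unchanged: {(~changed_mask).sum()}")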
In [ ]:
# List to store words that were converted
converted_words_list = []

# Iterate through each row of the dataframe
for index, row in df.iterrows():
    original_words = df_copy.loc[index, 'sentence'].split()
    converted_words = row['sentence'].split()

    for orig, conv in zip(original_words, converted_words):
        if orig != conv:
            converted_words_list.append((orig, conv))

# Count the occurrence of each conversion
conversion_counter = Counter(converted_words_list)

# Display the most common conversions
common_conversions = conversion_counter.most_common()

print("Most common word conversions:")
for conversion, count in common_conversions:
    orig, conv = conversion
    print(f"{orig} -> {conv}: {count} times")
Most common word conversions:
In [ ]:
# Copy the current dataframe to df_before_token before tokenization
df_before_token = df.copy()
In [ ]:
print(df.head())
                                            sentence  word_count  \
0                  WHEN YOU'RE COOKING CHIPS AT HOME           6   
1  THE TRADITIONAL CHIP PAN OFTEN STAYS ON THE SHELF           9   
2               THROUGH WHAT THEY CALL A KNIFE BLOCK           7   
3         WHICH INVOLVES FIRING A POTATO DOWN A PIPE           8   
4  APART FROM THE GOLDEN COLOUR AND THE DELICIOUS...           9   

                                   original_sentence  \
0                  WHEN YOU'RE COOKING CHIPS AT HOME   
1  THE TRADITIONAL CHIP PAN OFTEN STAYS ON THE SHELF   
2               THROUGH WHAT THEY CALL A KNIFE BLOCK   
3         WHICH INVOLVES FIRING A POTATO DOWN A PIPE   
4  APART FROM THE GOLDEN COLOUR AND THE DELICIOUS...   

   word_count_after_conversion  changed  
0                            6    False  
1                            9    False  
2                            7    False  
3                            8    False  
4                            9    False  
In [ ]:
df['sentence'] = df['sentence'].str.lower()
df.head()
Out[ ]:
sentence word_count original_sentence word_count_after_conversion changed
0 when you're cooking chips at home 6 WHEN YOU'RE COOKING CHIPS AT HOME 6 False
1 the traditional chip pan often stays on the shelf 9 THE TRADITIONAL CHIP PAN OFTEN STAYS ON THE SHELF 9 False
2 through what they call a knife block 7 THROUGH WHAT THEY CALL A KNIFE BLOCK 7 False
3 which involves firing a potato down a pipe 8 WHICH INVOLVES FIRING A POTATO DOWN A PIPE 8 False
4 apart from the golden colour and the delicious... 9 APART FROM THE GOLDEN COLOUR AND THE DELICIOUS... 9 False
In [ ]:
# Load the CMU Pronunciation Dictionary
pronunciation_dict = cmudict.dict()

# Initialize the g2p converter
g2p = G2p()

def tokenize_and_lowercase_text(text):
    """Tokenize and lowercase text."""
    # Replace newline characters with space
    text = text.replace('\n', ' ')
    # Expand contractions
    text = contractions.fix(text)
    # Handle decades: split a trailing 's' off digit strings (e.g. "1990s" -> "1990 s")
    text = re.sub(r'(\d+)(s)', r'\1 \2', text)
    # Tokenize text
    tokens = nltk.word_tokenize(text)
    # Lowercase tokens
    tokens = [token.lower() for token in tokens]
    return tokens

def words_to_phonemes(words):
    phonemes = []
    for word in words:
        if word in ['.', ',', '?', '!', ':', ';']:
            phonemes.append('<space>')
        else:
            if word in pronunciation_dict:
                phonemes.extend(pronunciation_dict[word][0])
                phonemes.append('<space>')
            elif word == "'":
                # Skip stray apostrophe tokens produced by the tokenizer
                pass
            else:
                phonemes.extend(g2p(word))
                phonemes.append('<space>')
    return phonemes

def process_sentence(sentence):
    try:
        # Tokenize and lowercase text
        tokenized_sentence = tokenize_and_lowercase_text(sentence)

        # Convert words to phonemes
        phonemes = words_to_phonemes(tokenized_sentence)
        # Wrap with <sos>/<eos>, dropping the trailing <space> token
        phonemes = ['<sos>'] + phonemes[:-1] + ['<eos>']
        return phonemes
    except Exception as e:
        print(f"Error processing sentence: {sentence}")
        print(e)
        return None

def expand_contractions(text):
    """Expand contractions in a text."""
    return contractions.fix(text)

# Expand contractions in the sentence column
df['sentence'] = df['sentence'].apply(expand_contractions)

# Then apply the tokenization and phoneme conversion processes as before
with Pool() as pool:
    df['phonemes'] = pool.map(process_sentence, df['sentence'])

print(df.head())


# Inspect the data
# Check the sentences where the <space> token is not present or is present less frequently than expected
df['word_count'] = df['sentence'].apply(lambda x: len(x.split()))
df['num_spaces'] = df['phonemes'].apply(lambda x: x.count('<space>'))
unusual_sentences = df[df['num_spaces'] < df['word_count'] - 1]
print(unusual_sentences)
                                            sentence  word_count  \
0                 when you are cooking chips at home           6   
1  the traditional chip pan often stays on the shelf           9   
2               through what they call a knife block           7   
3         which involves firing a potato down a pipe           8   
4  apart from the golden colour and the delicious...           9   

                                   original_sentence  \
0                  WHEN YOU'RE COOKING CHIPS AT HOME   
1  THE TRADITIONAL CHIP PAN OFTEN STAYS ON THE SHELF   
2               THROUGH WHAT THEY CALL A KNIFE BLOCK   
3         WHICH INVOLVES FIRING A POTATO DOWN A PIPE   
4  APART FROM THE GOLDEN COLOUR AND THE DELICIOUS...   

   word_count_after_conversion  changed  \
0                            6    False   
1                            9    False   
2                            7    False   
3                            8    False   
4                            9    False   

                                            phonemes  
0  [<sos>, W, EH1, N, <space>, Y, UW1, <space>, A...  
1  [<sos>, DH, AH0, <space>, T, R, AH0, D, IH1, S...  
2  [<sos>, TH, R, UW1, <space>, W, AH1, T, <space...  
3  [<sos>, W, IH1, CH, <space>, IH0, N, V, AA1, L...  
4  [<sos>, AH0, P, AA1, R, T, <space>, F, R, AH1,...  
Empty DataFrame
Columns: [sentence, word_count, original_sentence, word_count_after_conversion, changed, phonemes, num_spaces]
Index: []
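To make the lookup-then-fallback behaviour of words_to_phonemes concrete, here is a minimal sketch (reusing the pronunciation_dict and g2p objects defined above; the example words are arbitrary):

# In-vocabulary word: taken directly from the CMU dictionary (first listed pronunciation)
print(pronunciation_dict['home'][0])        # e.g. ['HH', 'OW1', 'M']
# Out-of-vocabulary word: handled by the g2p fallback instead
print('dambusters' in pronunciation_dict)   # expected: False
print(g2p('dambusters'))                    # ARPAbet guess produced by g2p-en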
In [ ]:
# Sample 10 random sentences from the dataset
sample_sentences = df['sentence'].sample(10)
token_counts = [len(tokenize_and_lowercase_text(sentence)) for sentence in sample_sentences]
sentence_counts = [len(sentence.split()) for sentence in sample_sentences]

# Bar Chart
index = range(len(sample_sentences))
bar_width = 0.35
fig, ax = plt.subplots(figsize=(12, 6))
bar1 = ax.bar(index, sentence_counts, bar_width, label='Original Word Count', color='#3498DB', edgecolor='black')
bar2 = ax.bar([i + bar_width for i in index], token_counts, bar_width, label='Tokenized Word Count', color='#E74C3C', edgecolor='black')

ax.set_xlabel('Sentences')
ax.set_ylabel('Word Count')
ax.set_title('Comparison of Word Counts Before and After Tokenization')
ax.set_xticks([i + bar_width for i in index])
ax.set_xticklabels(['Sentence ' + str(i+1) for i in index], rotation=45)
ax.legend()
plt.tight_layout()
plt.show()

# Annotated Text Display
for index, sentence in enumerate(sample_sentences[:2]):
    tokens = tokenize_and_lowercase_text(sentence)
    print(f"Sentences {index+1}:")
    print(f"Original: {sentence}")
    print(f"Tokenized: {tokens}")
    print("-"*100)
[Bar chart: Comparison of Word Counts Before and After Tokenization]
Sentence 1:
Original: that is always fascinated me
Tokenized: ['that', 'is', 'always', 'fascinated', 'me']
----------------------------------------------------------------------------------------------------
Sentence 2:
Original: which means the light comes from hot
Tokenized: ['which', 'means', 'the', 'light', 'comes', 'from', 'hot']
----------------------------------------------------------------------------------------------------
In [ ]:
# Convert the list of phonemes in the 'phonemes' column to a space-separated string
df['phonemes_str'] = df['phonemes'].str.join(' ')

# Create a function to display the dataframe without truncation
def display_full_dataframe(dataframe):
    with pd.option_context('display.max_rows', None, 'display.max_columns', None, 'display.max_colwidth', None):
        display(dataframe)

# Use the function to display the first 5 rows
display_full_dataframe(df[["phonemes_str"]].head())
phonemes_str
0 <sos> W EH1 N <space> Y UW1 <space> AA1 R <space> K UH1 K IH0 NG <space> CH IH1 P S <space> AE1 T <space> HH OW1 M <eos>
1 <sos> DH AH0 <space> T R AH0 D IH1 SH AH0 N AH0 L <space> CH IH1 P <space> P AE1 N <space> AO1 F AH0 N <space> S T EY1 Z <space> AA1 N <space> DH AH0 <space> SH EH1 L F <eos>
2 <sos> TH R UW1 <space> W AH1 T <space> DH EY1 <space> K AO1 L <space> AH0 <space> N AY1 F <space> B L AA1 K <eos>
3 <sos> W IH1 CH <space> IH0 N V AA1 L V Z <space> F AY1 R IH0 NG <space> AH0 <space> P AH0 T EY1 T OW2 <space> D AW1 N <space> AH0 <space> P AY1 P <eos>
4 <sos> AH0 P AA1 R T <space> F R AH1 M <space> DH AH0 <space> G OW1 L D AH0 N <space> K AH1 L AW0 R <space> AH0 N D <space> DH AH0 <space> D IH0 L IH1 SH AH0 S <space> F L AE1 V ER0 <eos>
In [ ]:
df.head()
Out[ ]:
sentence word_count original_sentence word_count_after_conversion changed phonemes num_spaces phonemes_str
0 when you are cooking chips at home 7 WHEN YOU'RE COOKING CHIPS AT HOME 6 False [<sos>, W, EH1, N, <space>, Y, UW1, <space>, A... 6 <sos> W EH1 N <space> Y UW1 <space> AA1 R <spa...
1 the traditional chip pan often stays on the shelf 9 THE TRADITIONAL CHIP PAN OFTEN STAYS ON THE SHELF 9 False [<sos>, DH, AH0, <space>, T, R, AH0, D, IH1, S... 8 <sos> DH AH0 <space> T R AH0 D IH1 SH AH0 N AH...
2 through what they call a knife block 7 THROUGH WHAT THEY CALL A KNIFE BLOCK 7 False [<sos>, TH, R, UW1, <space>, W, AH1, T, <space... 6 <sos> TH R UW1 <space> W AH1 T <space> DH EY1 ...
3 which involves firing a potato down a pipe 8 WHICH INVOLVES FIRING A POTATO DOWN A PIPE 8 False [<sos>, W, IH1, CH, <space>, IH0, N, V, AA1, L... 7 <sos> W IH1 CH <space> IH0 N V AA1 L V Z <spac...
4 apart from the golden colour and the delicious... 9 APART FROM THE GOLDEN COLOUR AND THE DELICIOUS... 9 False [<sos>, AH0, P, AA1, R, T, <space>, F, R, AH1,... 8 <sos> AH0 P AA1 R T <space> F R AH1 M <space> ...
In [ ]:
# Remove sentences whose phoneme sequences contain a stray apostrophe or a raw ' ' separator
df = df[~df['phonemes'].apply(lambda x: "'" in x or ' ' in x)]
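The raw ' ' entries this filter guards against can appear when g2p-en expands a single token into several words internally (it separates the words of its output with a bare space); a quick way to see this, assuming that behaviour of the library:

# g2p-en normalizes digits to words before conversion, so a numeric token
# can yield several words separated by literal ' ' entries in the result
example = g2p("1972")
print(example)
print("contains raw space:", ' ' in example)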
In [ ]:
# Dictionary of stress-digit replacements (remove_stress below uses re.sub directly)
remove_stress_dict = {str(i): '' for i in range(10)}

def remove_stress(phonemes):
    """Remove stress markers from a list of phonemes."""
    return [re.sub(r'\d', '', phoneme) for phoneme in phonemes]

def add_special_tokens(sentence):
    """Add special tokens to a sentence."""
    return '<sos> ' + sentence.replace(' ', ' <space> ') + ' <eos>'

# Apply the function to the sentence column
df['sentence_with_tokens'] = df['sentence'].apply(add_special_tokens)

print(df[['sentence', 'sentence_with_tokens', 'phonemes']].sample(10))

# Apply the processing function
df['phonemes'] = df['phonemes'].apply(remove_stress)

# Sample Inspection
print(df[['sentence', 'phonemes']].sample(10))

# Distribution Analysis
df['phoneme_count'] = df['phonemes'].str.len()
print(df['phoneme_count'].describe())

# Special Tokens Check
wrong_start = df[df['phonemes'].str[0] != "<sos>"]
wrong_end = df[df['phonemes'].str[-1] != "<eos>"]
print(f"Number of sequences with wrong start: {len(wrong_start)}")
print(f"Number of sequences with wrong end: {len(wrong_end)}")

# Check for None values
none_sentences = df[df['phonemes'].apply(lambda x: None in x)]
print(f"Number of sentences with None values: {len(none_sentences)}")

# Frequency Analysis
all_phonemes = list(chain.from_iterable(df['phonemes']))
phoneme_freq = Counter(all_phonemes)
print("Most common phonemes:", phoneme_freq.most_common(10))
print("Least common phonemes:", phoneme_freq.most_common()[:-11:-1])

# Check if there are any missing phonemes (same None check as above, kept as a second safeguard)
missing_phonemes = df[df['phonemes'].apply(lambda x: None in x)]
print(f"Number of sentences with missing phonemes: {len(missing_phonemes)}")

space_sentences = df[df['phonemes'].apply(lambda x: ' ' in x)]
print(space_sentences[['sentence', 'phonemes']])
<ipython-input-50-a357df83680a>:13: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['sentence_with_tokens'] = df['sentence'].apply(add_special_tokens)
                                                sentence  \
12522                   tried to break it out of my hand   
12508                           do not you call a doctor   
16858                         we have got what we wanted   
18655  two of our best teams will be competing agains...   
1100    if you had clarissa beside you on the barricades   
43334                                  you brought it in   
40657                          the dead do not come back   
11112                           by the famous dambusters   
25708  if you clear your plate of sandwiches they kee...   
17329                                  how are you today   

                                    sentence_with_tokens  \
12522  <sos> tried <space> to <space> break <space> i...   
12508  <sos> do <space> not <space> you <space> call ...   
16858  <sos> we <space> have <space> got <space> what...   
18655  <sos> two <space> of <space> our <space> best ...   
1100   <sos> if <space> you <space> had <space> clari...   
43334  <sos> you <space> brought <space> it <space> i...   
40657  <sos> the <space> dead <space> do <space> not ...   
11112  <sos> by <space> the <space> famous <space> da...   
25708  <sos> if <space> you <space> clear <space> you...   
17329  <sos> how <space> are <space> you <space> toda...   

                                                phonemes  
12522  [<sos>, T, R, AY1, D, <space>, T, UW1, <space>...  
12508  [<sos>, D, UW1, <space>, N, AA1, T, <space>, Y...  
16858  [<sos>, W, IY1, <space>, HH, AE1, V, <space>, ...  
18655  [<sos>, T, UW1, <space>, AH1, V, <space>, AW1,...  
1100   [<sos>, IH1, F, <space>, Y, UW1, <space>, HH, ...  
43334  [<sos>, Y, UW1, <space>, B, R, AO1, T, <space>...  
40657  [<sos>, DH, AH0, <space>, D, EH1, D, <space>, ...  
11112  [<sos>, B, AY1, <space>, DH, AH0, <space>, F, ...  
25708  [<sos>, IH1, F, <space>, Y, UW1, <space>, K, L...  
17329  [<sos>, HH, AW1, <space>, AA1, R, <space>, Y, ...  
                                                sentence  \
21948  when you can actually hold the fabric that the...   
4958   there is something about them which makes us f...   
18610                         bills of sale and receipts   
43467                          look out for the qr codes   
30773         most influential figures in british comedy   
7102   i cannot wait to see it and you can find out h...   
14877  we are pinning all our hopes on the man with t...   
5095                            there is lots of smaller   
25014                   why do not you do the power test   
38100                                 on that assumption   

                                                phonemes  
21948  [<sos>, W, EH, N, <space>, Y, UW, <space>, K, ...  
4958   [<sos>, DH, EH, R, <space>, IH, Z, <space>, S,...  
18610  [<sos>, B, IH, L, Z, <space>, AH, V, <space>, ...  
43467  [<sos>, L, UH, K, <space>, AW, T, <space>, F, ...  
30773  [<sos>, M, OW, S, T, <space>, IH, N, F, L, UW,...  
7102   [<sos>, AY, <space>, K, AE, N, <space>, N, AA,...  
14877  [<sos>, W, IY, <space>, AA, R, <space>, P, IH,...  
5095   [<sos>, DH, EH, R, <space>, IH, Z, <space>, L,...  
25014  [<sos>, W, AY, <space>, D, UW, <space>, N, AA,...  
38100  [<sos>, AA, N, <space>, DH, AE, T, <space>, AH...  
count    45814.000000
mean        34.139040
std         17.523979
min         11.000000
25%         21.000000
50%         29.000000
75%         42.000000
max        141.000000
Name: phoneme_count, dtype: float64
Number of sequences with wrong start: 0
Number of sequences with wrong end: 0
Number of sentences with None values: 0
Most common phonemes: [('<space>', 299529), ('AH', 111029), ('T', 91599), ('N', 77726), ('IH', 75183), ('R', 52083), ('S', 50329), ('D', 47510), ('<sos>', 45814), ('<eos>', 45814)]
Least common phonemes: [('ZH', 444), ('OY', 1151), ('UH', 5864), ('JH', 6134), ('CH', 6196), ('TH', 6864), ('SH', 7392), ('AW', 8615), ('Y', 11279), ('G', 11628)]
Number of sentences with missing phonemes: 0
Empty DataFrame
Columns: [sentence, phonemes]
Index: []
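As a quick illustration of what remove_stress does to a single entry (a sketch; the input sequence is arbitrary):

# Stress digits (0/1/2) are stripped from each ARPAbet symbol
print(remove_stress(['HH', 'AH0', 'L', 'OW1']))   # -> ['HH', 'AH', 'L', 'OW']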
In [ ]:
# Add tokens to sentences for comparison (same transformation as add_special_tokens above)
def add_tokens_to_sentence(sentence):
    return '<sos> ' + sentence.replace(' ', ' <space> ') + ' <eos>'

df['tokenized_sentence'] = df['sentence'].apply(add_tokens_to_sentence)

# Convert the list of phonemes to a space-separated string for display
df['phonemes_str'] = df['phonemes'].apply(lambda x: ' '.join(x))

# Display the tokenized sentences and their corresponding phonemes
sample_comparison = df[['tokenized_sentence', 'phonemes_str']].sample(5)

with pd.option_context('display.max_rows', None, 'display.max_colwidth', None):
    display(sample_comparison)
tokenized_sentence phonemes_str
17720 <sos> but <space> what <space> it <space> does <space> need <space> is <space> a <space> set <space> of <space> eyes <eos> <sos> B AH T <space> W AH T <space> IH T <space> D AH Z <space> N IY D <space> IH Z <space> AH <space> S EH T <space> AH V <space> AY Z <eos>
26979 <sos> taken <space> very <space> seriously <eos> <sos> T EY K AH N <space> V EH R IY <space> S IH R IY AH S L IY <eos>
39564 <sos> let <space> us <space> find <space> out <space> about <space> one <space> of <space> the <space> most <space> ancient <space> plants <space> on <space> the <space> planet <eos> <sos> L EH T <space> AH S <space> F AY N D <space> AW T <space> AH B AW T <space> W AH N <space> AH V <space> DH AH <space> M OW S T <space> EY N CH AH N T <space> P L AE N T S <space> AA N <space> DH AH <space> P L AE N AH T <eos>
32297 <sos> he <space> might <space> also <space> have <space> been <space> quietly <space> beheaded <eos> <sos> HH IY <space> M AY T <space> AO L S OW <space> HH AE V <space> B IH N <space> K W AY AH T L IY <space> B IH HH EH D IH D <eos>
27232 <sos> within <space> twenty <space> four <space> hours <eos> <sos> W IH DH IH N <space> T W EH N T IY <space> F AO R <space> AW ER Z <eos>
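A further sanity check (a small sketch over the columns created above) is that the word-level and phoneme-level sequences agree on the number of word boundaries:

# The number of <space> tokens should normally match between the two representations
mismatch = df[df['tokenized_sentence'].str.count('<space>') != df['phonemes_str'].str.count('<space>')]
print(f"Rows with mismatched <space> counts: {len(mismatch)}")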
In [ ]:
def check_consecutive_special_tokens(sentence_sequences, phoneme_sequences):
    special_tokens = ['<eos>', '<sos>', '<space>']
    for seq in sentence_sequences:
        for token in special_tokens:
            if f"{token} {token}" in seq:
                print(f"Consecutive {token} found in sentence: {seq}")
    for seq in phoneme_sequences:
        for token in special_tokens:
            if f"{token} {token}" in ' '.join(seq):
                print(f"Consecutive {token} found in phoneme: {' '.join(seq)}")

# Example usage:
check_consecutive_special_tokens(df['sentence_with_tokens'], df['phonemes'])
Consecutive <space> found in phoneme: <sos> DH AH <space> AA R CH ER Z <space> L AO S T <space> IH T S <space> EH JH AH K EY SH AH N AH L <space> P ER P AH S <space> IH N <space> W AH N <space> TH AW Z AH N D <space> <space> N AY N <space> HH AH N D R AH D <space> AH N D <space> S EH V AH N T IY <space> T UW <eos>
Consecutive <space> found in phoneme: <sos> W IY <space> AA R <space> K AA N S AH N T R EY T IH NG <space> AA N <space> W AH N <space> TH AW Z AH N D <space> <space> N AY N <space> HH AH N D R AH D <space> AH N D <space> S IH K S T IY <space> F AO R <space> AH N D <space> W AH N <space> TH AW Z AH N D <space> <space> N AY N <space> HH AH N D R AH D <space> AH N D <space> S IH K S T IY <space> F AY V <space> AH N D <space> HH IY R IH NG <space> S AH M <space> M AO R <space> P ER S IH N IH L <space> S T AO R IY Z <space> AH V <eos>
Consecutive <space> found in phoneme: <sos> L EH T <space> AH S <space> T EY K <space> AH <space> L UH K <space> AE T <space> S AH M <space> AH V <space> DH AH <space> AH DH ER <space> N UW Z <space> HH EH D L AY N Z <space> IH N <space> W AH N <space> TH AW Z AH N D <space> <space> N AY N <space> HH AH N D R AH D <space> AH N D <space> S IH K S T IY <space> S IH K S <space> AH N D <space> S IH K S T IY <space> S EH V AH N <space> AH L AO NG <space> W IH DH <space> S AH M <space> AH V <space> DH AH <space> M Y UW Z IH K <eos>
Consecutive <space> found in phoneme: <sos> F OW K AH S IH NG <space> AA N <space> DH AH <space> IH V EH N T S <space> IH N <space> W AH N <space> TH AW Z AH N D <space> <space> N AY N <space> HH AH N D R AH D <space> AH N D <space> S IH K S T IY <space> EY T <space> AH N D <space> W AH N <space> TH AW Z AH N D <space> <space> N AY N <space> HH AH N D R AH D <space> AH N D <space> S IH K S T IY <space> N AY N <eos>
Consecutive <space> found in phoneme: <sos> HH UW <space> D AY D <space> IH N <space> W AH N <space> TH AW Z AH N D <space> <space> N AY N <space> HH AH N D R AH D <space> AH N D <space> F AO R T IY <eos>
Consecutive <space> found in phoneme: <sos> B IY IH NG <space> R AH S IY V D <space> IH N <space> W AH N <space> TH AW Z AH N D <space> <space> S EH V AH N <space> HH AH N D R AH D <space> AH N D <space> TH ER D IY <space> F AO R <space> AH N D <space> M AE N Y AH F AE K CH ER D <space> DH AE T <space> Y IH R <eos>
Consecutive <space> found in phoneme: <sos> IH T <space> IH Z <space> B IH N <space> P R AH D UW S IH NG <space> L OW K AH L <space> EY L <space> S IH N S <space> AE T <space> L IY S T <space> W AH N <space> TH AW Z AH N D <space> <space> S IH K S <space> HH AH N D R AH D <space> AH N D <space> F AO R T IY <space> T UW <eos>
Consecutive <space> found in phoneme: <sos> B AH T <space> IH N <space> F AE K T <space> IH N <space> W AH N <space> TH AW Z AH N D <space> <space> F AY V <space> HH AH N D R AH D <space> AH N D <space> T W EH N T IY <space> S EH V AH N <eos>
Consecutive <space> found in phoneme: <sos> T UW <space> DH AH <space> T AY M <space> AH V <space> HH IH Z <space> D EH TH <space> IH N <space> W AH N <space> TH AW Z AH N D <space> <space> N AY N <space> HH AH N D R AH D <eos>
Consecutive <space> found in phoneme: <sos> AY <space> HH AE V <space> B IH N <space> AH <space> V EH JH AH T EH R IY AH N <space> S IH N S <space> W AH N <space> TH AW Z AH N D <space> <space> N AY N <space> HH AH N D R AH D <space> AH N D <space> S EH V AH N T IY <space> N AY N <eos>
Consecutive <space> found in phoneme: <sos> AH <space> V AH N IY SH AH N <space> HH UW <space> K EY M <space> HH IY R <space> B IH T W IY N <space> AH B AW T <space> W AH N <space> TH AW Z AH N D <space> <space> S EH V AH N <space> HH AH N D R AH D <space> AH N D <space> N AY N <space> AH N D <space> W AH N <space> TH AW Z AH N D <space> <space> S EH V AH N <space> HH AH N D R AH D <space> AH N D <space> T EH N <eos>
Consecutive <space> found in phoneme: <sos> DH AH <space> AA R CH ER Z <space> F R AH M <space> W AH N <space> TH AW Z AH N D <space> <space> N AY N <space> HH AH N D R AH D <space> AH N D <space> EY T IY <space> T UW <space> DH AH <space> P R EH Z AH N T <space> D EY <eos>
Consecutive <space> found in phoneme: <sos> AW ER <space> F ER S T <space> W AA Z <space> CH OW Z AH N <space> F AO R <space> HH ER <space> M AE JH AH S T IY <space> EH S <space> W EH D IH NG <space> T UW <space> P R IH N S <space> F IH L AH P <space> B AE K <space> IH N <space> W AH N <space> TH AW Z AH N D <space> <space> N AY N <space> HH AH N D R AH D <space> AH N D <space> F AO R T IY <space> S EH V AH N <eos>
Consecutive <space> found in phoneme: <sos> AH N D <space> B AY <space> DH AH <space> T AY M <space> AH V <space> HH IH Z <space> D EH TH <space> IH N <space> W AH N <space> TH AW Z AH N D <space> <space> EY T <space> HH AH N D R AH D <space> AH N D <space> TH ER D IY <space> F AY V <eos>
Consecutive <space> found in phoneme: <sos> IH T <space> W AA Z <space> M EY D <space> IH N <space> AH B AW T <space> W AH N <space> TH AW Z AH N D <space> <space> N AY N <space> HH AH N D R AH D <space> AH N D <space> TH ER D IY <space> F AY V <eos>
Consecutive <space> found in phoneme: <sos> W EH N <space> DH AH <space> R EY L W EY <space> S T EY SH AH N <space> W AA Z <space> IH N AO G ER EY T IH D <space> HH IY R <space> IH N <space> AA G AH S T <space> W AH N <space> TH AW Z AH N D <space> <space> EY T <space> HH AH N D R AH D <space> AH N D <space> F IH F T IY <space> W AH N <eos>
Consecutive <space> found in phoneme: <sos> P R IY S IH ZH AH N <space> S T R AY K <space> W AA Z <space> W AH T <space> DH AH <space> D AE M B AH S T ER Z <space> W ER <space> AH B AW T <space> IH N <space> W AH N <space> TH AW Z AH N D <space> <space> N AY N <space> HH AH N D R AH D <space> AH N D <space> F AO R T IY <space> TH R IY <space> AH N D <space> DH AE T <space> IH Z <space> V EH R IY <space> M AH CH <space> DH AH <space> S EY M <space> T AH D EY <eos>
Consecutive <space> found in phoneme: <sos> W EH N <space> HH EH N R IY <space> N EH L T <space> T UW <space> M EY K <space> HH IH Z <space> W EH D IH NG <space> V AW Z <space> T UW <space> AE N <space> B OW L IH N <space> IH N <space> JH AE N Y UW EH R IY <space> W AH N <space> TH AW Z AH N D <space> <space> F AY V <space> HH AH N D R AH D <space> AH N D <space> TH ER D IY <space> TH R IY <eos>
Consecutive <space> found in phoneme: <sos> IH N <space> AA K T OW B ER <space> W AH N <space> TH AW Z AH N D <space> <space> F AY V <space> HH AH N D R AH D <space> AH N D <space> TH ER D IY <space> S EH V AH N <eos>
Consecutive <space> found in phoneme: <sos> B AH T <space> AY <space> TH IH NG K <space> IH T <space> IH Z <space> AH <space> W AH N D ER F AH L <space> IY V OW K EY SH AH N <space> AH V <space> DH AH <space> HH AA R D <space> T AY M Z <space> IH N <space> W AH N <space> TH AW Z AH N D <space> <space> N AY N <space> HH AH N D R AH D <space> AH N D <space> F AO R T IY <eos>
Consecutive <space> found in phoneme: <sos> DH AE T <space> HH AE D <space> AH <space> AH K <space> T AA P <space> T EH N <space> S IH NG G AH L <space> IH N <space> W AH N <space> TH AW Z AH N D <space> <space> N AY N <space> HH AH N D R AH D <space> AH N D <space> N AY N T IY <space> F AY V <space> W IH DH <space> T ER N <space> AA N <eos>
Consecutive <space> found in phoneme: <sos> F ER S T <space> P ER F AO R M D <space> IH N <space> W AH N <space> TH AW Z AH N D <space> <space> EY T <space> HH AH N D R AH D <space> AH N D <space> F AO R T IY <space> S IH K S <eos>
Consecutive <space> found in phoneme: <sos> HH IY R <space> IH Z <space> AH <space> R IH L IY <space> G UH D <space> W EY <space> T UW <space> D IY L <space> W IH DH <space> DH EH M <space> IH N <space> DH AH <space> W AH N <space> TH AW Z AH N D <space> <space> N AY N <space> HH AH N D R AH D <space> AH N D <space> EY T IY <space> F AO R <space> G AA R D AH N <eos>
Consecutive <space> found in phoneme: <sos> W IH CH <space> AY <space> AE M <space> G EH S IH NG <space> IH Z <space> S AH M TH IH NG <space> B IH T W IY N <space> W AH N <space> TH AW Z AH N D <space> <space> EY T <space> HH AH N D R AH D <space> AH N D <space> T W EH N T IY <eos>
Consecutive <space> found in phoneme: <sos> HH IY <space> D AY D <space> IH N <space> W AH N <space> TH AW Z AH N D <space> <space> S EH V AH N <space> HH AH N D R AH D <space> AH N D <space> F IH F T IY <space> F AO R <eos>
Consecutive <space> found in phoneme: <sos> DH EY <space> M EH R IY D <space> IH N <space> W AH N <space> TH AW Z AH N D <space> <space> EY T <space> HH AH N D R AH D <space> AH N D <space> N AY N T IY <space> TH R IY <eos>
Consecutive <space> found in phoneme: <sos> AH N D <space> HH IY <space> R IH T AY R D <space> IH N <space> AH B AW T <space> W AH N <space> TH AW Z AH N D <space> <space> N AY N <space> HH AH N D R AH D <space> AH N D <space> N AY N T IY <space> W AH N <eos>
Consecutive <space> found in phoneme: <sos> M AH Z UH R IY <space> IH N <space> W AH N <space> TH AW Z AH N D <space> <space> N AY N <space> HH AH N D R AH D <space> AH N D <space> F AO R T IY <space> S IH K S <eos>
Consecutive <space> found in phoneme: <sos> G AY <space> G AA T <space> HH IH Z <space> N EY M <space> W EH N <space> HH IY <space> ER AY V D <space> AE T <space> DH AH <space> Z UW <space> AA N <space> G AY <space> F AO K S <space> D EY <space> IH N <space> W AH N <space> TH AW Z AH N D <space> <space> N AY N <space> HH AH N D R AH D <space> AH N D <space> F AO R T IY <space> S EH V AH N <eos>
Consecutive <space> found in phoneme: <sos> K IH L ER T AH N <space> HH AW S <space> W AA Z <space> B IH L T <space> IH N <space> W AH N <space> TH AW Z AH N D <space> <space> S EH V AH N <space> HH AH N D R AH D <space> AH N D <space> S EH V AH N T IY <space> N AY N <space> F AO R <space> W AH N <space> AH V <space> D EH V AH N <space> EH S <space> OW L D AH S T <space> F AE M AH L IY Z <eos>
Consecutive <space> found in phoneme: <sos> K AH M IH NG <space> AH P <space> T UW <space> W AH N <space> TH AW Z AH N D <space> <space> EY T <space> HH AH N D R AH D <space> AH N D <space> AH <space> B IH T <space> B IH AA N D <eos>
Consecutive <space> found in phoneme: <sos> IH N <space> ER AW N D <space> AH B AW T <space> DH AH <space> Y IH R <space> W AH N <space> TH AW Z AH N D <space> <space> N AY N <space> HH AH N D R AH D <space> AH N D <space> T EH N <eos>
Consecutive <space> found in phoneme: <sos> AH N D <space> HH IY <space> W AA Z <space> B AO R N <space> IH N <space> W AH N <space> TH AW Z AH N D <space> <space> EY T <space> HH AH N D R AH D <space> AH N D <space> EY T IY <space> EY T <eos>
Consecutive <space> found in phoneme: <sos> IH N <space> DH AH <space> W IH N T ER <space> AH V <space> W AH N <space> TH AW Z AH N D <space> <space> N AY N <space> HH AH N D R AH D <space> AH N D <space> F IH F T IY <space> EY T <eos>
Consecutive <space> found in phoneme: <sos> W IH CH <space> IH T <space> D IH D <space> AH N T IH L <space> W AH N <space> TH AW Z AH N D <space> <space> EY T <space> HH AH N D R AH D <space> AH N D <space> N AY N T IY <space> N AY N <eos>
Consecutive <space> found in phoneme: <sos> DH AH <space> L UW S AH T EY N IY AH <space> W AH N <space> DH AH <space> B L UW <space> R AY B AH N D <space> IH N <space> W AH N <space> TH AW Z AH N D <space> <space> N AY N <space> HH AH N D R AH D <space> AH N D <space> S EH V AH N <eos>
Consecutive <space> found in phoneme: <sos> P AH B L IH SH T <space> IH N <space> W AH N <space> TH AW Z AH N D <space> <space> N AY N <space> HH AH N D R AH D <space> AH N D <space> W AH N <space> AH N D <space> K AO L D <space> S IH M P L IY <space> P AA V ER T IY <eos>
Consecutive <space> found in phoneme: <sos> AY <space> HH AE V <space> B IH N <space> AE N <space> AE K T R AH S <space> S IH N S <space> W AH N <space> TH AW Z AH N D <space> <space> N AY N <space> HH AH N D R AH D <space> AH N D <space> EY T IY <space> F AY V <eos>
Consecutive <space> found in phoneme: <sos> IH T <space> W AA Z <space> K AE S T <space> IH N <space> L AO B ER OW <space> IH N <space> W AH N <space> TH AW Z AH N D <space> <space> N AY N <space> HH AH N D R AH D <space> AH N D <space> T W EH N T IY <space> F AO R <eos>
Consecutive <space> found in phoneme: <sos> DH EH R <space> W AA Z <space> L EH S <space> F AO R AH S T <space> IH N <space> W AH N <space> TH AW Z AH N D <space> <space> N AY N <space> HH AH N D R AH D <space> DH AE N <space> DH EH R <space> HH AE D <space> B IH N <space> F AO R <space> T EH N <eos>
Consecutive <space> found in phoneme: <sos> IH T <space> HH AE Z <space> B IH N <space> L EY D <space> AH P <space> S IH N S <space> IH T <space> B R OW K <space> D AW N <space> IH N <space> W AH N <space> TH AW Z AH N D <space> <space> N AY N <space> HH AH N D R AH D <space> AH N D <space> TH ER D IY <space> F AO R <eos>
Consecutive <space> found in phoneme: <sos> AY <space> W AA Z <space> L AH K IY <space> IH N AH F <space> T UW <space> S IH T <space> W IH DH <space> AH <space> G AY <space> HH UW <space> W AH N <space> DH AH <space> F ER S T <space> B R IH T IH SH <space> R AE L IY <space> CH AE M P IY AH N SH IH P <space> IH N <space> W AH N <space> TH AW Z AH N D <space> <space> N AY N <space> HH AH N D R AH D <space> AH N D <space> F IH F T IY <space> EY T <eos>
Consecutive <space> found in phoneme: <sos> IH T <space> M EY D <space> AH <space> R IY L <space> IH M P R EH SH AH N <space> AA N <space> K AE P T AH N <space> K UH K <space> W EH N <space> HH IY <space> K EY M <space> HH IY R <space> IH N <space> W AH N <space> TH AW Z AH N D <space> <space> S EH V AH N <space> HH AH N D R AH D <space> AH N D <space> S EH V AH N T IY <eos>
Consecutive <space> found in phoneme: <sos> S OW <space> F R AH M <space> W AH N <space> TH AW Z AH N D <space> <space> N AY N <space> HH AH N D R AH D <space> AH N D <space> F AO R T IY <space> T UW <space> AA N W ER D Z <eos>
Consecutive <space> found in phoneme: <sos> DH AE T <space> IH Z <space> W EH R <space> AY <space> S T AA R T AH D <space> M AY <space> B IY B IY S IY <space> K ER IH R <space> B AE K <space> IH N <space> W AH N <space> TH AW Z AH N D <space> <space> N AY N <space> HH AH N D R AH D <space> AH N D <space> N AY N T IY <space> TH R IY <eos>
Consecutive <space> found in phoneme: <sos> IH T <space> F ER S T <space> AH P IH R D <space> AA N <space> B IY B IY S IY <space> T UW <space> IH N <space> W AH N <space> TH AW Z AH N D <space> <space> N AY N <space> HH AH N D R AH D <space> AH N D <space> S EH V AH N T IY <space> EY T <eos>
Consecutive <space> found in phoneme: <sos> R AY T <space> TH R UW <space> T UW <space> AH B AW T <space> W AH N <space> TH AW Z AH N D <space> <space> EY T <space> HH AH N D R AH D <space> AH N D <space> F IH F T IY <eos>
Consecutive <space> found in phoneme: <sos> DH AH <space> ER IH JH AH N AH L <space> W AH N <space> W AA Z <space> N AA K T <space> D AW N <space> T UW <space> B IY <space> R IY P L EY S T <space> B AY <space> DH IH S <space> W AH N <space> IH N <space> W AH N <space> TH AW Z AH N D <space> <space> EY T <space> HH AH N D R AH D <space> AH N D <space> EY T IY <space> S EH V AH N <space> AH N D <space> W AH T <space> IH Z <space> IY V IH N <space> M AO R <eos>
Consecutive <space> found in phoneme: <sos> B IH K AO Z <space> W EH N <space> SH IY <space> D AY D <space> IH N <space> W AH N <space> TH AW Z AH N D <space> <space> EY T <space> HH AH N D R AH D <space> AH N D <space> F AO R T IY N <eos>
Consecutive <space> found in phoneme: <sos> HH UW <space> W AA Z <space> B AO R N <space> IH N <space> W AH N <space> TH AW Z AH N D <space> <space> N AY N <space> HH AH N D R AH D <space> AH N D <space> F AO R T IY <space> EY T <eos>
Consecutive <space> found in phoneme: <sos> AE N <space> OW L D <space> S IH N AH G AO G <space> D EY T IH NG <space> F R AH M <space> W AH N <space> TH AW Z AH N D <space> <space> TH R IY <space> HH AH N D R AH D <space> AH N D <space> TH ER D IY <space> EY T <eos>
Consecutive <space> found in phoneme: <sos> DH AH <space> R IY L <space> L AE S T <space> IH N V EY ZH AH N <space> AE K CH AH W AH L IY <space> K EY M <space> HH IY R <space> IH N <space> W AH N <space> TH AW Z AH N D <space> <space> F AY V <space> HH AH N D R AH D <space> AH N D <space> F AO R T IY <space> F AY V <eos>
Consecutive <space> found in phoneme: <sos> P AH L IH T AH K AH L <space> P R EH SH ER <space> L EH D <space> T UW <space> DH AH <space> P AE S AH JH <space> AH V <space> DH AH <space> W AH N <space> TH AW Z AH N D <space> <space> N AY N <space> HH AH N D R AH D <space> AH N D <space> F AY V <space> EY L IY AH N Z <space> AE K T <eos>
Consecutive <space> found in phoneme: <sos> AH N D <space> S EH T <space> AH G EH N S T <space> DH AH <space> B AE K D R AA P <space> AH V <space> DH AH <space> W AH N <space> TH AW Z AH N D <space> <space> N AY N <space> HH AH N D R AH D <space> AH N D <space> EY T IY <space> F AO R <space> M AY N ER Z <space> S T R AY K <eos>
Consecutive <space> found in phoneme: <sos> IH N D IY D <space> B IH T W IY N <space> W AH N <space> TH AW Z AH N D <space> <space> N AY N <space> HH AH N D R AH D <space> AH N D <space> F AO R T IY <space> F AY V <space> AH N D <space> W AH N <space> TH AW Z AH N D <space> <space> N AY N <space> HH AH N D R AH D <space> AH N D <space> S IH K S T IY <space> F AY V <space> IH T <space> IH Z <space> F EH R <space> T UW <space> S EY <space> DH AE T <space> HH IY <space> K AH N T R IH B Y UW T IH D <space> M AO R <eos>
Consecutive <space> found in phoneme: <sos> B IH L D IH NG <space> W ER K <space> S T AA R T AH D <space> IH N <space> W AH N <space> TH AW Z AH N D <space> <space> S EH V AH N <space> HH AH N D R AH D <space> AH N D <space> TH ER D IY <space> N AY N <eos>
Consecutive <space> found in phoneme: <sos> DH AH <space> HH AW S <space> W AA Z <space> B IH L T <space> IH N <space> W AH N <space> TH AW Z AH N D <space> <space> S IH K S <space> HH AH N D R AH D <space> AH N D <space> S IH K S <eos>
Consecutive <space> found in phoneme: <sos> AH <space> P AH B <space> CH EY N <space> EH S <space> B IH N <space> K R IH T AH S AY Z D <space> F AO R <space> DH IH S <space> D AH B AH L <space> D OW N AH T <space> B ER G ER <space> W IH DH <space> W AH N <space> TH AW Z AH N D <space> <space> N AY N <space> HH AH N D R AH D <space> AH N D <space> N AY N T IY <space> S IH K S <eos>
Consecutive <space> found in phoneme: <sos> AH <space> K AA P IY <space> AH V <space> AH <space> M Y UW T ER S AY K AH N G <space> M AE G AH Z IY N <space> F R AH M <space> W AH N <space> TH AW Z AH N D <space> <space> N AY N <space> HH AH N D R AH D <space> AH N D <space> S EH V AH N T IY <space> S IH K S <eos>
Consecutive <space> found in phoneme: <sos> M AO R <space> R IY S AH N T L IY <space> IH N <space> W AH N <space> TH AW Z AH N D <space> <space> N AY N <space> HH AH N D R AH D <space> AH N D <space> F AO R T IY <space> N AY N <eos>
Consecutive <space> found in phoneme: <sos> EH V ER <space> S IH N S <space> IH T S <space> F ER S T <space> AH P IH R AH N S <space> IH N <space> AH <space> B L AE K <space> AH N D <space> W AY T <space> S K R AE P Y AA R D <space> IH N <space> W AH N <space> TH AW Z AH N D <space> <space> N AY N <space> HH AH N D R AH D <space> AH N D <space> S IH K S T IY <space> TH R IY <eos>
Consecutive <space> found in phoneme: <sos> AH N D <space> IH N <space> D IH S EH M B ER <space> W AH N <space> TH AW Z AH N D <space> <space> F AY V <space> HH AH N D R AH D <space> AH N D <space> T W EH N T IY <space> S EH V AH N <eos>
Consecutive <space> found in phoneme: <sos> OW V ER <space> S IH K S <space> D EY Z <space> IH N <space> W AH N <space> TH AW Z AH N D <space> <space> N AY N <space> HH AH N D R AH D <space> AH N D <space> F IH F T IY <space> EY T <eos>
Consecutive <space> found in phoneme: <sos> S AH M <space> W AH N <space> TH AW Z AH N D <space> <space> EY T <space> HH AH N D R AH D <space> Y IH R Z <space> AH G OW <eos>
Consecutive <space> found in phoneme: <sos> AH N D <space> DH IH S <space> M EH G AH F OW N <space> D EY T S <space> F R AH M <space> W AH N <space> TH AW Z AH N D <space> <space> N AY N <space> HH AH N D R AH D <space> AH N D <space> S IH K S T IY <space> TH R IY <eos>
Consecutive <space> found in phoneme: <sos> T UW <space> TH AW Z AH N D <space> AH N D <space> T EH N <space> W IY <space> IH N HH EH R AH T IH D <space> DH AH <space> L OW AH S T <space> L EH V AH L <space> AH V <space> B IH L D IH NG Z <space> S IH N S <space> W AH N <space> TH AW Z AH N D <space> <space> N AY N <space> HH AH N D R AH D <space> AH N D <space> T W EH N T IY <space> TH R IY <eos>
Consecutive <space> found in phoneme: <sos> DH AH <space> F ER S T <space> R EH F ER AH N S <space> T UW <space> DH AH <space> B AO R AH S T OW N <space> IH Z <space> F R AH M <space> W AH N <space> TH AW Z AH N D <space> <space> S EH V AH N <space> HH AH N D R AH D <space> AH N D <space> T W EH N T IY <space> TH R IY <eos>
Consecutive <space> found in phoneme: <sos> HH UW <space> W AH N <space> AH <space> B EH S T <space> AE K T R AH S <space> AO S K ER <space> IH N <space> W AH N <space> TH AW Z AH N D <space> <space> N AY N <space> HH AH N D R AH D <space> AH N D <space> EY T IY <space> EY T <space> F AO R <space> DH AH <space> F IH L M <space> M UW N S T R AH K <eos>
Consecutive <space> found in phoneme: <sos> W EY <space> B AE K <space> IH N <space> W AH N <space> TH AW Z AH N D <space> <space> N AY N <space> HH AH N D R AH D <space> AH N D <space> N AY N T IY <space> W AH N <eos>
Consecutive <space> found in phoneme: <sos> HH UW <space> W AA Z <space> IH M P L IH K EY T IH D <space> IH N <space> DH AH <space> G AH N P AW D ER <space> P L AA T <space> IH N <space> W AH N <space> TH AW Z AH N D <space> <space> S IH K S <space> HH AH N D R AH D <space> AH N D <space> F AY V <eos>
Consecutive <space> found in phoneme: <sos> DH AH <space> S T AA R <space> AH V <space> DH AH <space> S T AA R <space> T R EH K <space> S IH R IY Z <space> AH N D <space> F IH L M Z <space> B IY M D <space> D AW N <space> T UW <space> DH AH <space> W OW G AH N <space> S T UW D IY OW <space> IH N <space> W AH N <space> TH AW Z AH N D <space> <space> N AY N <space> HH AH N D R AH D <space> AH N D <space> EY T IY <space> N AY N <eos>
Consecutive <space> found in phoneme: <sos> W EH N <space> B EH T IY <space> D EY V AH S <space> K EY M <space> AA N <space> DH AH <space> SH OW <space> B AE K <space> IH N <space> W AH N <space> TH AW Z AH N D <space> <space> N AY N <space> HH AH N D R AH D <space> AH N D <space> EY T IY <space> S EH V AH N <eos>
Consecutive <space> found in phoneme: <sos> IH N <space> W IH CH <space> AY <space> F L AY <space> AH <space> W AH N <space> TH AW Z AH N D <space> <space> N AY N <space> HH AH N D R AH D <space> AH N D <space> T W EH N T IY <space> S IH K S <space> S T IH R M AH N <eos>
Consecutive <space> found in phoneme: <sos> IH N <space> W AH N <space> TH AW Z AH N D <space> <space> EY T <space> HH AH N D R AH D <space> AH N D <space> T W EH N T IY <space> EY T <space> DH AE T <space> DH AH <space> F ER S T <space> F R EH N CH <space> AH N Y AH N <space> S EH L ER <space> D IH S AY D IH D <space> T UW <space> T R AY <space> HH IH Z <space> L AH K <space> AH N D <space> K R AO S <space> DH AH <eos>
Consecutive <space> found in phoneme: <sos> AH K AO R D IH NG <space> T UW <space> DH AH <space> W AH N <space> TH AW Z AH N D <space> <space> N AY N <space> HH AH N D R AH D <space> AH N D <space> IH L EH V AH N <space> S EH N S AH S <eos>
Consecutive <space> found in phoneme: <sos> IH N <space> W AH N <space> TH AW Z AH N D <space> <space> EY T <space> HH AH N D R AH D <space> AH N D <space> T W EH N T IY <space> EY T <space> DH AE T <space> DH AH <space> F ER S T <space> F R EH N CH <space> AH N Y AH N <space> S EH L ER <space> D IH S AY D IH D <space> T UW <space> T R AY <space> HH IH Z <space> L AH K <space> AH N D <space> K R AO S <space> DH AH <eos>
Consecutive <space> found in phoneme: <sos> IH T <space> W AA Z <space> AE T <space> DH IH S <space> V EH R IY <space> S P AA T <space> IH N <space> W AH N <space> TH AW Z AH N D <space> <space> S IH K S <space> HH AH N D R AH D <space> AH N D <space> F IH F T IY <space> W AH N <space> DH AE T <space> CH AA R L Z <space> IY <space> IH S K EY P T <space> K AE P CH ER <space> B AY <eos>
Consecutive <space> found in phoneme: <sos> W IY <space> HH AE V <space> N AA T <space> HH AE D <space> AE N <space> AA R M IY <space> S IH N S <space> W AH N <space> TH AW Z AH N D <space> <space> N AY N <space> HH AH N D R AH D <space> AH N D <space> F AO R T IY <space> EY T <eos>
Consecutive <space> found in phoneme: <sos> S OW <space> DH EY <space> JH OY N D <space> F AO R S IH Z <space> W IH DH <space> DH AH <space> AO S T R IY AH N Z <space> AH N D <space> B AY <space> W AH N <space> TH AW Z AH N D <space> <space> S EH V AH N <space> HH AH N D R AH D <space> AH N D <space> S IH K S <eos>
Consecutive <space> found in phoneme: <sos> S UW N <space> AE F T ER <space> DH AH <space> N AA T S IY Z <space> K EY M <space> T UW <space> P AW ER <space> IH N <space> W AH N <space> TH AW Z AH N D <space> <space> N AY N <space> HH AH N D R AH D <space> AH N D <space> TH ER D IY <space> TH R IY <eos>
Consecutive <space> found in phoneme: <sos> AH N D <space> SH IY <space> HH AE D <space> DH IH S <space> AH F EH R <space> W IH DH <space> EH D W ER D <space> B IH T W IY N <space> W AH N <space> TH AW Z AH N D <space> <space> N AY N <space> HH AH N D R AH D <space> AH N D <space> EY T IY N <space> AH N D <space> W AH N <space> TH AW Z AH N D <space> <space> N AY N <space> HH AH N D R AH D <space> AH N D <space> T W EH N T IY <space> TH R IY <eos>
Consecutive <space> found in phoneme: <sos> B IH K AO Z <space> IH N <space> W AH N <space> TH AW Z AH N D <space> <space> N AY N <space> HH AH N D R AH D <space> AH N D <space> EY T IY <space> W AH N <space> W IY <space> W ER <space> R IH L IY <space> AE T <space> DH AH <space> T IH P IH NG <space> P OY N T <space> B IH T W IY N <space> DH AH <space> T ER B OW <eos>
Consecutive <space> found in phoneme: <sos> AY <space> S IY <space> IH T <space> W AA Z <space> R IH T AH N <space> IH N <space> JH UW N <space> W AH N <space> TH AW Z AH N D <space> <space> N AY N <space> HH AH N D R AH D <space> AH N D <space> S IH K S T IY <space> S EH V AH N <eos>
Consecutive <space> found in phoneme: <sos> IH T <space> IH Z <space> B EY S T <space> AA N <space> HH IH Z <space> S EH L F <space> P AO R T R AH T <space> AH V <space> W AH N <space> TH AW Z AH N D <space> <space> S EH V AH N <space> HH AH N D R AH D <space> AH N D <space> F AO R T IY <space> F AY V <eos>
Consecutive <space> found in phoneme: <sos> DH IH S <space> S AY T <space> W AA Z <space> AE N <space> R AE F <space> EH R <space> B EY S <space> F R AH M <space> W AH N <space> TH AW Z AH N D <space> <space> N AY N <space> HH AH N D R AH D <space> AH N D <space> EY T IY N <space> AH N T IH L <space> W AH N <space> TH AW Z AH N D <space> <space> N AY N <space> HH AH N D R AH D <space> AH N D <space> S IH K S T IY <space> W AH N <space> AH N D <space> N AW <eos>
Consecutive <space> found in phoneme: <sos> IH N <space> DH AH <space> S AH M ER <space> AH V <space> W AH N <space> TH AW Z AH N D <space> <space> N AY N <space> HH AH N D R AH D <space> AH N D <space> F AO R T IY <eos>
Consecutive <space> found in phoneme: <sos> AE T <space> DH AH <space> B AE T AH L <space> AH V <space> K W IH B ER OW N <space> B EY <space> IH N <space> W AH N <space> TH AW Z AH N D <space> <space> S EH V AH N <space> HH AH N D R AH D <space> AH N D <space> F IH F T IY <space> N AY N <eos>
Consecutive <space> found in phoneme: <sos> IH N <space> DH AH <space> S AH M ER <space> AH V <space> W AH N <space> TH AW Z AH N D <space> <space> N AY N <space> HH AH N D R AH D <space> AH N D <space> F AO R T IY <eos>
Consecutive <space> found in phoneme: <sos> HH AE V IH NG <space> B IH N <space> K AH M P L IY T AH D <space> IH N <space> W AH N <space> TH AW Z AH N D <space> <space> N AY N <space> HH AH N D R AH D <space> AH N D <space> S IH K S T IY <eos>
Consecutive <space> found in phoneme: <sos> W IY <space> S AH D AH N L IY <space> EH M B AA R K T <space> AA N <space> AH <space> HH EH D L AO NG <space> R AH SH <space> T UW <space> G EH T <space> R IH D <space> AH V <space> S T IY M <space> F R AH M <space> W AH N <space> TH AW Z AH N D <space> <space> N AY N <space> HH AH N D R AH D <space> AH N D <space> S IH K S T IY <space> TH R IY <space> T UW <eos>
Consecutive <space> found in phoneme: <sos> R AY T <space> AH P <space> T UW <space> W AH N <space> TH AW Z AH N D <space> <space> EY T <space> HH AH N D R AH D <space> AH N D <space> T W EH N T IY <space> S IH K S <eos>
Consecutive <space> found in phoneme: <sos> HH IY <space> P AE T AH N T AH D <space> DH AH <space> S AH L IH N D R IH K AH L <space> S L AY D <space> R UW L <space> IH N <space> W AH N <space> TH AW Z AH N D <space> <space> EY T <space> HH AH N D R AH D <space> AH N D <space> S EH V AH N T IY <space> EY T <eos>
Consecutive <space> found in phoneme: <sos> B IH K AO Z <space> HH IY <space> K EY M <space> T UW <space> P AW ER <space> IH N <space> AH <space> M IH L AH T EH R IY <space> K UW <space> IH N <space> W AH N <space> TH AW Z AH N D <space> <space> N AY N <space> HH AH N D R AH D <space> AH N D <space> S EH V AH N T IY <space> S EH V AH N <eos>
Consecutive <space> found in phoneme: <sos> DH AH <space> L AE S T <space> AA B Z ER V EY SH AH N <space> T UW <space> B IY <space> D AH N <space> HH IY R <space> W AA Z <space> M EY D <space> IH N <space> W AH N <space> TH AW Z AH N D <space> <space> N AY N <space> HH AH N D R AH D <space> AH N D <space> F IH F T IY <space> F AO R <eos>
Consecutive <space> found in phoneme: <sos> IH N <space> D IH S EH M B ER <space> W AH N <space> TH AW Z AH N D <space> <space> N AY N <space> HH AH N D R AH D <space> AH N D <space> F IH F T IY <space> W AH N <eos>
Consecutive <space> found in phoneme: <sos> S IH N S <space> K AA M IH K <space> R IH L IY F <space> S T AA R T AH D <space> IH N <space> W AH N <space> TH AW Z AH N D <space> <space> N AY N <space> HH AH N D R AH D <space> AH N D <space> EY T IY <space> F AY V <eos>
Consecutive <space> found in phoneme: <sos> HH UW <space> W AA Z <space> B AO R N <space> IH N <space> W AH N <space> TH AW Z AH N D <space> <space> N AY N <space> HH AH N D R AH D <space> AH N D <space> IH L EH V AH N <eos>
Consecutive <space> found in phoneme: <sos> AH N D <space> AE Z <space> AH <space> CH AY L D <space> IH N <space> W AH N <space> TH AW Z AH N D <space> <space> N AY N <space> HH AH N D R AH D <space> AH N D <space> S IH K S T IY <space> S EH V AH N <eos>
Consecutive <space> found in phoneme: <sos> HH IY <space> W AA Z <space> W ER K IH NG <space> AW T <space> IH N <space> IY S T <space> AE F R AH K AH <space> IH N <space> W AH N <space> TH AW Z AH N D <space> <space> N AY N <space> HH AH N D R AH D <space> AH N D <space> S EH V AH N T IY <space> TH R IY <space> W EH N <space> HH IY <space> K EY M <space> AH K R AO S <space> AH <space> F AH S IH L AH S T <space> B OW N <space> DH AE T <eos>
Consecutive <space> found in phoneme: <sos> IH T <space> S T AA R T AH D <space> IH N <space> W AH N <space> TH AW Z AH N D <space> <space> N AY N <space> HH AH N D R AH D <space> AH N D <space> N AY N T IY <space> F AO R <eos>
Consecutive <space> found in phoneme: <sos> JH AH S T <space> AH N AH DH ER <space> T UW <space> TH AW Z AH N D <space> <space> TH R IY <space> HH AH N D R AH D <space> T UW <space> G OW <eos>
Consecutive <space> found in phoneme: <sos> DH IH S <space> B UH K <space> W AA Z <space> P AH B L IH SH T <space> IH N <space> W AH N <space> TH AW Z AH N D <space> <space> EY T <space> HH AH N D R AH D <space> AH N D <space> F IH F T IY <space> T UW <eos>
Consecutive <space> found in phoneme: <sos> IH N T R AH D UW S T <space> IH N T UW <space> S ER V AH S <space> IH N <space> W AH N <space> TH AW Z AH N D <space> <space> N AY N <space> HH AH N D R AH D <space> AH N D <space> S IH K S T IY <space> N AY N <eos>
Consecutive <space> found in phoneme: <sos> R AY T <space> AH P <space> AH N T IH L <space> W AH N <space> TH AW Z AH N D <space> <space> N AY N <space> HH AH N D R AH D <space> AH N D <space> N AY N T IY <space> EY T <eos>
Consecutive <space> found in phoneme: <sos> DH AE T <space> W AA Z <space> IH N <space> W AH N <space> TH AW Z AH N D <space> <space> N AY N <space> HH AH N D R AH D <space> AH N D <space> S EH V AH N T IY <space> TH R IY <eos>
Consecutive <space> found in phoneme: <sos> AH N D <space> DH AE T <space> T UH K <space> P L EY S <space> IH N <space> W AH N <space> TH AW Z AH N D <space> <space> EY T <space> HH AH N D R AH D <space> AH N D <space> TH ER D IY <space> F AY V <eos>
Consecutive <space> found in phoneme: <sos> W EH N <space> IH T <space> OW P AH N D <space> IH N <space> W AH N <space> TH AW Z AH N D <space> <space> EY T <space> HH AH N D R AH D <space> AH N D <space> T W EH N T IY <space> S IH K S <eos>
Consecutive <space> found in phoneme: <sos> W IY <space> G AA T <space> IH T <space> IH N <space> W AH N <space> TH AW Z AH N D <space> <space> EY T <space> HH AH N D R AH D <space> AH N D <space> F AO R T IY <space> EY T <eos>
Consecutive <space> found in phoneme: <sos> DH IY Z <space> W ER <space> N OW N <space> AE Z <space> DH AH <space> AO S T EH R IH T IY <space> G EY M Z <space> IH N <space> W AH N <space> TH AW Z AH N D <space> <space> N AY N <space> HH AH N D R AH D <space> AH N D <space> F AO R T IY <space> EY T <eos>
Consecutive <space> found in phoneme: <sos> DH IY Z <space> W ER <space> N OW N <space> AE Z <space> DH AH <space> AO S T EH R IH T IY <space> G EY M Z <space> IH N <space> W AH N <space> TH AW Z AH N D <space> <space> N AY N <space> HH AH N D R AH D <space> AH N D <space> F AO R T IY <space> EY T <eos>
Consecutive <space> found in phoneme: <sos> S OW <space> AY <space> R EH K AH N <space> AH B AW T <space> W AH N <space> TH AW Z AH N D <space> <space> S EH V AH N <space> HH AH N D R AH D <space> AH N D <space> N AY N T IY <eos>
Consecutive <space> found in phoneme: <sos> P AH B L IH SH T <space> IH N <space> W AH N <space> TH AW Z AH N D <space> <space> N AY N <space> HH AH N D R AH D <space> AH N D <space> EY T IY <space> F AO R <eos>
Consecutive <space> found in phoneme: <sos> AY <space> TH IH NG K <space> IH N <space> JH ER M AH N IY <space> IH T <space> S T AA R T AH D <space> W IH DH <space> DH AH <space> R EH V AH L UW SH AH N <space> IH N <space> W AH N <space> TH AW Z AH N D <space> <space> EY T <space> HH AH N D R AH D <space> AH N D <space> F AO R T IY <space> EY T <eos>
Consecutive <space> found in phoneme: <sos> AH N D <space> DH AE T <space> W AA Z <space> AH B AW T <space> W AH N <space> TH AW Z AH N D <space> <space> EY T <space> HH AH N D R AH D <space> AH N D <space> F AO R T IY <space> T UW <eos>
In [ ]:
def count_consecutive_special_tokens(sentence_sequences, phoneme_sequences):
    """Count repeated-special-token hits across both representations (a row can contribute more than once)."""
    special_tokens = ['<eos>', '<sos>', '<space>']
    count = 0
    for seq in sentence_sequences:
        for token in special_tokens:
            if f"{token} {token}" in seq:
                count += 1
    for seq in phoneme_sequences:
        for token in special_tokens:
            if f"{token} {token}" in ' '.join(seq):
                count += 1
    return count

# Example usage:
count = count_consecutive_special_tokens(df['sentence_with_tokens'], df['phonemes'])
print(f"Number of sentences with consecutive special tokens: {count}")
Number of sentences with consecutive special tokens: 114
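The cleanup a few cells below simply drops these rows; an alternative, shown here only as a sketch and not used in the pipeline, would be to collapse runs of <space> tokens in place:

def collapse_repeated_spaces(phonemes):
    """Collapse consecutive <space> tokens into a single <space>."""
    collapsed = []
    for token in phonemes:
        if token == '<space>' and collapsed and collapsed[-1] == '<space>':
            continue
        collapsed.append(token)
    return collapsed

# Hypothetical example:
print(collapse_repeated_spaces(['<sos>', 'W', '<space>', '<space>', 'N', '<eos>']))
# -> ['<sos>', 'W', '<space>', 'N', '<eos>']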
In [ ]:
import matplotlib.pyplot as plt
import seaborn as sns

# Count consecutive special tokens
count = count_consecutive_special_tokens(df['sentence_with_tokens'], df['phonemes'])
# Data for visualization (second bar shows the remaining, unaffected sentences)
labels = ['Sentences with Consecutive Tokens', 'Remaining Sentences']
values = [count, len(df) - count]
percentages = [value / len(df) * 100 for value in values]
# Colors for the visualizations
colors = ['#3498DB', '#E74C3C']
# Visualization
plt.figure(figsize=(10, 6))
sns.set_context("talk", font_scale=0.8)
bars = sns.barplot(x=labels, y=values, palette=colors)
# Annotate the bars with the count value and percentage
for index, (value, percentage) in enumerate(zip(values, percentages)):
    plt.text(index, value + (0.02 * max(values)),
             f"{value} ({percentage:.1f}%)",
             ha='center', va='center', fontweight='bold', fontsize=14)
# Set title and labels
plt.title('Sentences with Consecutive Special Tokens vs. Total Sentences', fontsize=15)
plt.ylabel('Number of Sentences', fontsize=13)
plt.xticks(fontsize=12)
# Ensure the text fits within the figure bounds
plt.tight_layout()
# Show the plot
plt.show()
[Bar chart: Sentences with Consecutive Special Tokens vs. Total Sentences]
In [ ]:
df.head()
Out[ ]:
sentence word_count original_sentence word_count_after_conversion changed phonemes num_spaces phonemes_str sentence_with_tokens phoneme_count tokenized_sentence
0 when you are cooking chips at home 7 WHEN YOU'RE COOKING CHIPS AT HOME 6 False [<sos>, W, EH, N, <space>, Y, UW, <space>, AA,... 6 <sos> W EH N <space> Y UW <space> AA R <space>... <sos> when <space> you <space> are <space> coo... 29 <sos> when <space> you <space> are <space> coo...
1 the traditional chip pan often stays on the shelf 9 THE TRADITIONAL CHIP PAN OFTEN STAYS ON THE SHELF 9 False [<sos>, DH, AH, <space>, T, R, AH, D, IH, SH, ... 8 <sos> DH AH <space> T R AH D IH SH AH N AH L <... <sos> the <space> traditional <space> chip <sp... 44 <sos> the <space> traditional <space> chip <sp...
2 through what they call a knife block 7 THROUGH WHAT THEY CALL A KNIFE BLOCK 7 False [<sos>, TH, R, UW, <space>, W, AH, T, <space>,... 6 <sos> TH R UW <space> W AH T <space> DH EY <sp... <sos> through <space> what <space> they <space... 27 <sos> through <space> what <space> they <space...
3 which involves firing a potato down a pipe 8 WHICH INVOLVES FIRING A POTATO DOWN A PIPE 8 False [<sos>, W, IH, CH, <space>, IH, N, V, AA, L, V... 7 <sos> W IH CH <space> IH N V AA L V Z <space> ... <sos> which <space> involves <space> firing <s... 38 <sos> which <space> involves <space> firing <s...
4 apart from the golden colour and the delicious... 9 APART FROM THE GOLDEN COLOUR AND THE DELICIOUS... 9 False [<sos>, AH, P, AA, R, T, <space>, F, R, AH, M,... 8 <sos> AH P AA R T <space> F R AH M <space> DH ... <sos> apart <space> from <space> the <space> g... 49 <sos> apart <space> from <space> the <space> g...
In [ ]:
def has_consecutive_special_tokens(seq):
    special_tokens = ['<eos>', '<sos>', '<space>']
    for token in special_tokens:
        if f"{token} {token}" in seq:
            return True
    return False

# Create a mask that is True for rows without consecutive special tokens
mask = ~df['sentence_with_tokens'].apply(has_consecutive_special_tokens) & ~df['phonemes'].apply(lambda x: has_consecutive_special_tokens(' '.join(x)))

# Index df with the mask
df = df[mask]

print(df)
                                                sentence  word_count  \
0                     when you are cooking chips at home           7   
1      the traditional chip pan often stays on the shelf           9   
2                   through what they call a knife block           7   
3             which involves firing a potato down a pipe           8   
4      apart from the golden colour and the delicious...           9   
...                                                  ...         ...   
45834  when he is not having his seizures when he is ...          13   
45835            she wants attention from both of us and           8   
45836                as much as we try to give it to her          10   
45837                                    they so deserve           3   
45838            thank you enough for what you have done           8   

                                       original_sentence  \
0                      WHEN YOU'RE COOKING CHIPS AT HOME   
1      THE TRADITIONAL CHIP PAN OFTEN STAYS ON THE SHELF   
2                   THROUGH WHAT THEY CALL A KNIFE BLOCK   
3             WHICH INVOLVES FIRING A POTATO DOWN A PIPE   
4      APART FROM THE GOLDEN COLOUR AND THE DELICIOUS...   
...                                                  ...   
45834  WHEN HE'S NOT HAVING HIS SEIZURES WHEN HE'S NO...   
45835            SHE WANTS ATTENTION FROM BOTH OF US AND   
45836                AS MUCH AS WE TRY TO GIVE IT TO HER   
45837                                    THEY SO DESERVE   
45838              THANK YOU ENOUGH FOR WHAT YOU'VE DONE   

       word_count_after_conversion  changed  \
0                                6    False   
1                                9    False   
2                                7    False   
3                                8    False   
4                                9    False   
...                            ...      ...   
45834                           11    False   
45835                            8    False   
45836                           10    False   
45837                            3    False   
45838                            7    False   

                                                phonemes  num_spaces  \
0      [<sos>, W, EH, N, <space>, Y, UW, <space>, AA,...           6   
1      [<sos>, DH, AH, <space>, T, R, AH, D, IH, SH, ...           8   
2      [<sos>, TH, R, UW, <space>, W, AH, T, <space>,...           6   
3      [<sos>, W, IH, CH, <space>, IH, N, V, AA, L, V...           7   
4      [<sos>, AH, P, AA, R, T, <space>, F, R, AH, M,...           8   
...                                                  ...         ...   
45834  [<sos>, W, EH, N, <space>, HH, IY, <space>, IH...          12   
45835  [<sos>, SH, IY, <space>, W, AA, N, T, S, <spac...           7   
45836  [<sos>, AE, Z, <space>, M, AH, CH, <space>, AE...           9   
45837  [<sos>, DH, EY, <space>, S, OW, <space>, D, IH...           2   
45838  [<sos>, TH, AE, NG, K, <space>, Y, UW, <space>...           7   

                                            phonemes_str  \
0      <sos> W EH N <space> Y UW <space> AA R <space>...   
1      <sos> DH AH <space> T R AH D IH SH AH N AH L <...   
2      <sos> TH R UW <space> W AH T <space> DH EY <sp...   
3      <sos> W IH CH <space> IH N V AA L V Z <space> ...   
4      <sos> AH P AA R T <space> F R AH M <space> DH ...   
...                                                  ...   
45834  <sos> W EH N <space> HH IY <space> IH Z <space...   
45835  <sos> SH IY <space> W AA N T S <space> AH T EH...   
45836  <sos> AE Z <space> M AH CH <space> AE Z <space...   
45837  <sos> DH EY <space> S OW <space> D IH Z ER V <...   
45838  <sos> TH AE NG K <space> Y UW <space> IH N AH ...   

                                    sentence_with_tokens  phoneme_count  \
0      <sos> when <space> you <space> are <space> coo...             29   
1      <sos> the <space> traditional <space> chip <sp...             44   
2      <sos> through <space> what <space> they <space...             27   
3      <sos> which <space> involves <space> firing <s...             38   
4      <sos> apart <space> from <space> the <space> g...             49   
...                                                  ...            ...   
45834  <sos> when <space> he <space> is <space> not <...             54   
45835  <sos> she <space> wants <space> attention <spa...             37   
45836  <sos> as <space> much <space> as <space> we <s...             34   
45837        <sos> they <space> so <space> deserve <eos>             13   
45838  <sos> thank <space> you <space> enough <space>...             33   

                                      tokenized_sentence  
0      <sos> when <space> you <space> are <space> coo...  
1      <sos> the <space> traditional <space> chip <sp...  
2      <sos> through <space> what <space> they <space...  
3      <sos> which <space> involves <space> firing <s...  
4      <sos> apart <space> from <space> the <space> g...  
...                                                  ...  
45834  <sos> when <space> he <space> is <space> not <...  
45835  <sos> she <space> wants <space> attention <spa...  
45836  <sos> as <space> much <space> as <space> we <s...  
45837        <sos> they <space> so <space> deserve <eos>  
45838  <sos> thank <space> you <space> enough <space>...  

[45700 rows x 11 columns]
In [ ]:
space_sentences = df[df['phonemes'].apply(lambda x: ' ' in x)]
print(space_sentences[['sentence', 'phonemes']])
Empty DataFrame
Columns: [sentence, phonemes]
Index: []
In [ ]:
import sys
sys.path.append('/content/drive/MyDrive/Dissertation')
from label_vectorization import SentenceVectorizer
In [ ]:
# Get the 10 most common phonemes from phoneme_freq, the Counter built over all
# phonemes in df['phonemes'] (see the Counter cell that follows)
most_common_phonemes = phoneme_freq.most_common(10)
# Print the 10 most common phonemes
print("10 Most Common Phonemes:")
for phoneme, count in most_common_phonemes:
    print(f"{phoneme}: {count}")
# Set up the visualization with a refined style and context
sns.set_style("whitegrid")
sns.set_context("talk")
plt.figure(figsize=(15, 8))
# Extract phoneme names and their counts
phonemes = [phoneme for phoneme, _ in most_common_phonemes]
counts = [count for _, count in most_common_phonemes]
# Use a sophisticated color palette (deep muted colors)
palette = sns.color_palette("viridis", n_colors=len(most_common_phonemes))
# Plot the phoneme frequencies
bars = sns.barplot(x=phonemes, y=counts, palette=palette)
# Add annotations to each bar
for index, value in enumerate(counts):
    bars.text(index, value + max(counts)*0.02, f'{value} ({value/sum(counts)*100:.1f}%)', color='black', ha="center", va="bottom", fontsize=12)
# Set title, xlabel, ylabel and adjust font sizes
plt.title('Top 10 Phoneme Frequencies', fontsize=22, fontweight='bold', pad=20)
plt.xlabel('Phoneme', fontsize=18)
plt.ylabel('Frequency', fontsize=18)
plt.xticks(fontsize=16)
plt.yticks(fontsize=16)
# Ensure the plot layout is organized
plt.tight_layout()
# Show the plot
plt.show()
10 Most Common Phonemes:
<space>: 299529
AH: 111029
T: 91599
N: 77726
IH: 75183
R: 52083
S: 50329
D: 47510
<sos>: 45814
<eos>: 45814
[Figure: bar chart of the top 10 phoneme frequencies, annotated with counts and percentages]
In [ ]:
# Concatenate all lists of phonemes and create a Counter object
all_phonemes = [phoneme for sublist in df['phonemes'] for phoneme in sublist]
phoneme_freq = Counter(all_phonemes)

# Get all unique phonemes
unique_phonemes = list(phoneme_freq.keys())
unique_phonemes
Out[ ]:
['<sos>',
 'W',
 'EH',
 'N',
 '<space>',
 'Y',
 'UW',
 'AA',
 'R',
 'K',
 'UH',
 'IH',
 'NG',
 'CH',
 'P',
 'S',
 'AE',
 'T',
 'HH',
 'OW',
 'M',
 '<eos>',
 'DH',
 'AH',
 'D',
 'SH',
 'L',
 'AO',
 'F',
 'EY',
 'Z',
 'TH',
 'AY',
 'B',
 'V',
 'AW',
 'G',
 'ER',
 'IY',
 'JH',
 'OY',
 'ZH']
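A quick sanity check (a minimal sketch): the viseme dictionary defined next uses bare ARPAbet symbols, so no entry in the inventory should still carry a trailing stress digit.
In [ ]:
# Sketch: confirm that no phoneme in the inventory still carries a stress
# digit (e.g. 'AH0'); the viseme dictionary below assumes bare ARPAbet symbols.
with_stress = [p for p in unique_phonemes if any(ch.isdigit() for ch in p)]
print(f"Phonemes still carrying stress digits: {with_stress}")
print(f"Inventory size (incl. <sos>/<eos>/<space>): {len(unique_phonemes)}")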
In [ ]:
# Define viseme categories
viseme_dict = {
    'aa': ['aa', 'aw', 'ay'], 'ah': ['ah'], 'ao': ['ao', 'oy', 'ow'],
    'ch': ['jh', 'ch', 'sh', 'zh'], 'er': ['er'], 'ey': ['eh', 'ey', 'ae'],
    'f': ['f', 'v'], 'iy': ['ih', 'iy'], 'k': ['k', 'g', 'ng', 'n'],
    'p': ['p', 'b', 'm'], 't': ['t', 'd', 's', 'z', 'th', 'dh'],
    'uh': ['uh', 'uw'], 'w': ['w', 'r', 'l', 'y', 'hh'],
    'space': ['<space>'], 'sos': ['<sos>'], 'eos': ['<eos>']
}

phoneme_to_viseme = {phoneme: viseme for viseme, phonemes in viseme_dict.items() for phoneme in phonemes}

def phonemes_to_visemes(phonemes):
    visemes = []
    for phoneme in phonemes:
        if phoneme in ['<sos>', '<eos>', '<space>']:
            visemes.append(phoneme)
        else:
            phoneme = phoneme[:-1] if phoneme[-1].isdigit() else phoneme
            viseme = phoneme_to_viseme.get(phoneme, 'unknown')
            visemes.append(viseme)
    return visemes


# Reference DataFrame listing the full phoneme inventory, kept for a manual check of mapping coverage
df_check = pd.DataFrame({
    'phonemes': [['<sos>', 'W', 'EH', 'N', '<space>', 'Y', 'UW', 'K', 'UH', 'IH', 'NG', 'CH', 'P', 'S', 'AE', 'T', 'HH', 'OW', 'M', '<eos>', 'DH', 'AH', 'R', 'D', 'SH', 'L', 'AO', 'F', 'EY', 'Z', 'AA', 'TH', 'AY', 'B', 'V', 'AW', 'G', 'ER', 'IY', 'JH', 'OY', 'ZH']]
})

# Convert phonemes to lowercase to match the keys of the viseme dictionary
df['phonemes'] = df['phonemes'].apply(lambda phonemes: [phoneme.lower() for phoneme in phonemes])

# Convert phonemes to visemes in df
df['visemes'] = df['phonemes'].apply(phonemes_to_visemes)

# Print the first few rows to check the results
print(df[['phonemes', 'visemes']].head())

# Visual Inspection
print(df[['phonemes', 'visemes']].sample(5))

# Mapping Consistency: rebuild the observed phoneme-to-viseme pairs from the data.
# A separate dict is used so the phoneme_to_viseme lookup table defined above is not overwritten.
observed_mapping = {}
inconsistencies = 0
for phonemes, visemes in zip(df['phonemes'], df['visemes']):
    for phoneme, viseme in zip(phonemes, visemes):
        phoneme = phoneme[:-1] if phoneme[-1].isdigit() else phoneme
        if phoneme in observed_mapping:
            if observed_mapping[phoneme] != viseme:
                inconsistencies += 1
        else:
            observed_mapping[phoneme] = viseme

print(f'Number of inconsistencies in mapping: {inconsistencies}')

# Usage of Unknown Visemes
unknown_visemes_count = df['visemes'].apply(lambda x: x.count('unknown')).sum()
print(f'Number of unknown visemes: {unknown_visemes_count}')
                                            phonemes  \
0  [<sos>, w, eh, n, <space>, y, uw, <space>, aa,...   
1  [<sos>, dh, ah, <space>, t, r, ah, d, ih, sh, ...   
2  [<sos>, th, r, uw, <space>, w, ah, t, <space>,...   
3  [<sos>, w, ih, ch, <space>, ih, n, v, aa, l, v...   
4  [<sos>, ah, p, aa, r, t, <space>, f, r, ah, m,...   

                                             visemes  
0  [<sos>, w, ey, k, <space>, w, uh, <space>, aa,...  
1  [<sos>, t, ah, <space>, t, w, ah, t, iy, ch, a...  
2  [<sos>, t, w, uh, <space>, w, ah, t, <space>, ...  
3  [<sos>, w, iy, ch, <space>, iy, k, f, aa, w, f...  
4  [<sos>, ah, p, aa, w, t, <space>, f, w, ah, p,...  
                                                phonemes  \
16950  [<sos>, ih, z, <space>, n, aa, t, <space>, dh,...   
9165   [<sos>, ae, z, <space>, dh, ah, <space>, d, ae...   
17444  [<sos>, ih, t, <space>, w, aa, z, <space>, hh,...   
1656   [<sos>, y, uw, <space>, hh, ae, v, <space>, g,...   
40670  [<sos>, w, iy, <space>, aa, r, <space>, ae, s,...   

                                                 visemes  
16950  [<sos>, iy, t, <space>, k, aa, t, <space>, t, ...  
9165   [<sos>, ey, t, <space>, t, ah, <space>, t, ey,...  
17444  [<sos>, iy, t, <space>, w, aa, t, <space>, w, ...  
1656   [<sos>, w, uh, <space>, w, ey, f, <space>, k, ...  
40670  [<sos>, w, iy, <space>, aa, w, <space>, ey, t,...  
Number of inconsistencies in mapping: 0
Number of unknown visemes: 0
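As a quick spot check of the mapping (an illustrative sketch; the word and its phonemes are taken from row 0 of df), the phonemes of "chips" are pushed through phonemes_to_visemes:
In [ ]:
# Illustrative sketch: map the phonemes of "chips" (row 0 of df) through
# phonemes_to_visemes and show both sequences side by side.
chips_phonemes = ['ch', 'ih', 'p', 's']
print("phonemes:", chips_phonemes)
print("visemes: ", phonemes_to_visemes(chips_phonemes))  # expected: ['ch', 'iy', 'p', 't']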
In [ ]:
# Set display options
pd.set_option('display.max_rows', 5)
pd.set_option('display.max_colwidth', None)

# Display the first 5 rows
display(df[['phonemes', 'visemes']].head())
phonemes visemes
0 [<sos>, w, eh, n, <space>, y, uw, <space>, aa, r, <space>, k, uh, k, ih, ng, <space>, ch, ih, p, s, <space>, ae, t, <space>, hh, ow, m, <eos>] [<sos>, w, ey, k, <space>, w, uh, <space>, aa, w, <space>, k, uh, k, iy, k, <space>, ch, iy, p, t, <space>, ey, t, <space>, w, ao, p, <eos>]
1 [<sos>, dh, ah, <space>, t, r, ah, d, ih, sh, ah, n, ah, l, <space>, ch, ih, p, <space>, p, ae, n, <space>, ao, f, ah, n, <space>, s, t, ey, z, <space>, aa, n, <space>, dh, ah, <space>, sh, eh, l, f, <eos>] [<sos>, t, ah, <space>, t, w, ah, t, iy, ch, ah, k, ah, w, <space>, ch, iy, p, <space>, p, ey, k, <space>, ao, f, ah, k, <space>, t, t, ey, t, <space>, aa, k, <space>, t, ah, <space>, ch, ey, w, f, <eos>]
2 [<sos>, th, r, uw, <space>, w, ah, t, <space>, dh, ey, <space>, k, ao, l, <space>, ah, <space>, n, ay, f, <space>, b, l, aa, k, <eos>] [<sos>, t, w, uh, <space>, w, ah, t, <space>, t, ey, <space>, k, ao, w, <space>, ah, <space>, k, aa, f, <space>, p, w, aa, k, <eos>]
3 [<sos>, w, ih, ch, <space>, ih, n, v, aa, l, v, z, <space>, f, ay, r, ih, ng, <space>, ah, <space>, p, ah, t, ey, t, ow, <space>, d, aw, n, <space>, ah, <space>, p, ay, p, <eos>] [<sos>, w, iy, ch, <space>, iy, k, f, aa, w, f, t, <space>, f, aa, w, iy, k, <space>, ah, <space>, p, ah, t, ey, t, ao, <space>, t, aa, k, <space>, ah, <space>, p, aa, p, <eos>]
4 [<sos>, ah, p, aa, r, t, <space>, f, r, ah, m, <space>, dh, ah, <space>, g, ow, l, d, ah, n, <space>, k, ah, l, aw, r, <space>, ah, n, d, <space>, dh, ah, <space>, d, ih, l, ih, sh, ah, s, <space>, f, l, ae, v, er, <eos>] [<sos>, ah, p, aa, w, t, <space>, f, w, ah, p, <space>, t, ah, <space>, k, ao, w, t, ah, k, <space>, k, ah, w, aa, w, <space>, ah, k, t, <space>, t, ah, <space>, t, iy, w, iy, ch, ah, t, <space>, f, w, ey, f, er, <eos>]
In [ ]:
# Calculate the distribution of visemes in the dataset
viseme_distribution = pd.Series([item for sublist in df['visemes'] for item in sublist]).value_counts()

# Set up the visualization parameters
sns.set_style("whitegrid")
sns.set_palette("coolwarm_r")
sns.set_context("talk")

# Calculate the percentage of each viseme in the dataset
viseme_percentage = (viseme_distribution / viseme_distribution.sum()) * 100

# Create a horizontal bar plot for the visemes
plt.figure(figsize=(14, 10))
ax = sns.barplot(y=viseme_distribution.index, x=viseme_distribution.values, orient="h", palette="viridis")

# Annotate each bar with the count and percentage of each viseme
for index, value in enumerate(viseme_distribution.values):
    ax.text(value, index,
            f'{value} ({viseme_percentage.iloc[index]:.1f}%)',
            color='black', ha="left", va="center", fontsize=10)

plt.title('Distribution of Visemes in the Dataset', fontsize=16, fontweight='bold')
plt.ylabel('Viseme', fontsize=14)
plt.xlabel('Count', fontsize=14)
plt.show()
[Figure: horizontal bar chart of the distribution of visemes in the dataset, annotated with counts and percentages]
In [ ]:
# Extract unique phonemes and visemes from the dataframe
unique_phonemes = set([item for sublist in df['phonemes'] for item in sublist])
unique_visemes = set([item for sublist in df['visemes'] for item in sublist])
# Exclude the special tokens from the filtered list
exclude_tokens = ['<space>', '<sos>', '<eos>', 'space', 'sos', 'eos']
filtered_phonemes = [phoneme for phoneme in unique_phonemes if phoneme not in exclude_tokens]
filtered_visemes = [viseme for viseme in unique_visemes if viseme not in exclude_tokens]
# Efficiently indexing the confusion matrix
phoneme_index = {phoneme: idx for idx, phoneme in enumerate(filtered_phonemes)}
viseme_index = {viseme: idx for idx, viseme in enumerate(filtered_visemes)}
# Create a matrix for the filtered phonemes and visemes
confusion_matrix = np.zeros((len(filtered_phonemes), len(filtered_visemes)))
# Update the matrix based on the mappings in the dataset
for phonemes, visemes in zip(df['phonemes'], df['visemes']):
    for phoneme, viseme in zip(phonemes, visemes):
        if phoneme in phoneme_index and viseme in viseme_index:
            i = phoneme_index[phoneme]
            j = viseme_index[viseme]
            confusion_matrix[i][j] += 1
# Plot the heatmap
plt.figure(figsize=(14, 10))
ax = sns.heatmap(confusion_matrix, annot=True, fmt=".0f", cmap="Blues",
                 xticklabels=filtered_visemes, yticklabels=filtered_phonemes,
                 annot_kws={"size": 12})
plt.title("Phoneme to Viseme Mapping Heatmap", fontsize=18, fontweight='bold')
plt.xlabel("Viseme", fontsize=16)
plt.ylabel("Phoneme", fontsize=16)
plt.xticks(fontsize=12)
plt.yticks(fontsize=12)
plt.tight_layout()
plt.show()
[Figure: phoneme-to-viseme mapping heatmap]
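Since the dictionary defines a many-to-one mapping, each phoneme row of the heatmap should contain exactly one non-zero column. A minimal sketch verifying this on the confusion_matrix built above:
In [ ]:
# Sketch: every phoneme should map to a single viseme, so each row of the
# matrix should have exactly one non-zero entry (expected count below: 0).
multi_mapped = ((confusion_matrix > 0).sum(axis=1) > 1).sum()
print(f"Phonemes mapping to more than one viseme: {multi_mapped}")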
In [ ]:
# Check length consistency between phonemes and visemes
length_consistency = df['phonemes'].str.len().equals(df['visemes'].str.len())
print(f'Length consistency: {length_consistency}')

# Calculate lengths
df['phoneme_length'] = df['phonemes'].apply(len)
df['viseme_length'] = df['visemes'].apply(len)

# Find mismatches
mismatches = df[df['phoneme_length'] != df['viseme_length']]

# Print the sentences, phonemes, and visemes for those rows
for _, row in mismatches.head().iterrows():
    print(f"Sentence: {row['expanded_sentence']}")
    print(f"Phonemes: {' '.join(row['phonemes'])}")
    print(f"Visemes: {' '.join(row['visemes'])}")
    print(f"Phoneme Length: {row['phoneme_length']}")
    print(f"Viseme Length: {row['viseme_length']}\n")

# Display a sample of sentences, phonemes, and visemes for comparison
sample_comparison = df[['sentence', 'phonemes', 'visemes']].sample(5)

for _, row in sample_comparison.iterrows():
    print(f"Sentence: {row['sentence']}")
    print(f"Phonemes: {' '.join(row['phonemes'])}")
    print(f"Visemes: {' '.join(row['visemes'])}\n")
Length consistency: True
Sentence: we made some big bales as well
Phonemes: <sos> w iy <space> m ey d <space> s ah m <space> b ih g <space> b ey l z <space> ae z <space> w eh l <eos>
Visemes: <sos> w iy <space> p ey t <space> t ah p <space> p iy k <space> p ey w t <space> ey t <space> w ey w <eos>

Sentence: they had places in london all through their lives too
Phonemes: <sos> dh ey <space> hh ae d <space> p l ey s ah z <space> ih n <space> l ah n d ah n <space> ao l <space> th r uw <space> dh eh r <space> l ih v z <space> t uw <eos>
Visemes: <sos> t ey <space> w ey t <space> p w ey t ah t <space> iy k <space> w ah k t ah k <space> ao w <space> t w uh <space> t ey w <space> w iy f t <space> t uh <eos>

Sentence: the metropolitan cathedral
Phonemes: <sos> dh ah <space> m eh t r ah p aa l ah t ah n <space> k ah th iy d r ah l <eos>
Visemes: <sos> t ah <space> p ey t w ah p aa w ah t ah k <space> k ah t iy t w ah w <eos>

Sentence: it gave that hint of sexuality
Phonemes: <sos> ih t <space> g ey v <space> dh ae t <space> hh ih n t <space> ah v <space> s eh k sh uw ae l ah t iy <eos>
Visemes: <sos> iy t <space> k ey f <space> t ey t <space> w iy k t <space> ah f <space> t ey k ch uh ey w ah t iy <eos>

Sentence: we have our final two to play
Phonemes: <sos> w iy <space> hh ae v <space> aw er <space> f ay n ah l <space> t uw <space> t uw <space> p l ey <eos>
Visemes: <sos> w iy <space> w ey f <space> aa er <space> f aa k ah w <space> t uh <space> t uh <space> p w ey <eos>

In [ ]:
# Display Sample Comparisons
sample_df = df.sample(5)
for index, row in sample_df.iterrows():
    print(f"Sentence {index + 1}: {row['sentence']}")
    print(f"Phonemes: {' '.join(row['phonemes'])}")
    print(f"Visemes: {' '.join(row['visemes'])}\n")
Sentence 14447: and it goes really high and every night
Phonemes: <sos> ah n d <space> ih t <space> g ow z <space> r ih l iy <space> hh ay <space> ah n d <space> eh v er iy <space> n ay t <eos>
Visemes: <sos> ah k t <space> iy t <space> k ao t <space> w iy w iy <space> w aa <space> ah k t <space> ey f er iy <space> k aa t <eos>

Sentence 41803: she was very sensitive to the fact that monarchs could be replaced by this method
Phonemes: <sos> sh iy <space> w aa z <space> v eh r iy <space> s eh n s ah t ih v <space> t uw <space> dh ah <space> f ae k t <space> dh ae t <space> m aa n aa r k s <space> k uh d <space> b iy <space> r iy p l ey s t <space> b ay <space> dh ih s <space> m eh th ah d <eos>
Visemes: <sos> ch iy <space> w aa t <space> f ey w iy <space> t ey k t ah t iy f <space> t uh <space> t ah <space> f ey k t <space> t ey t <space> p aa k aa w k t <space> k uh t <space> p iy <space> w iy p w ey t t <space> p aa <space> t iy t <space> p ey t ah t <eos>

Sentence 35827: i do not belong to any club
Phonemes: <sos> ay <space> d uw <space> n aa t <space> b ih l ao ng <space> t uw <space> eh n iy <space> k l ah b <eos>
Visemes: <sos> aa <space> t uh <space> k aa t <space> p iy w ao k <space> t uh <space> ey k iy <space> k w ah p <eos>

Sentence 45508: what can be done to help farmers like james
Phonemes: <sos> w ah t <space> k ae n <space> b iy <space> d ah n <space> t uw <space> hh eh l p <space> f aa r m er z <space> l ay k <space> jh ey m z <eos>
Visemes: <sos> w ah t <space> k ey k <space> p iy <space> t ah k <space> t uh <space> w ey w p <space> f aa w p er t <space> w aa k <space> ch ey p t <eos>

Sentence 24063: and dirac did not like to speak in french
Phonemes: <sos> ah n d <space> d ih r ah k <space> d ih d <space> n aa t <space> l ay k <space> t uw <space> s p iy k <space> ih n <space> f r eh n ch <eos>
Visemes: <sos> ah k t <space> t iy w ah k <space> t iy t <space> k aa t <space> w aa k <space> t uh <space> t p iy k <space> iy k <space> f w ey k ch <eos>

In [ ]:
import os

# Store the original directory
original_directory = os.getcwd()

# Change to the directory where phonemes.txt is located
os.chdir('/content/drive/MyDrive/Dissertation/')

# Revert back to the original directory
os.chdir(original_directory)
In [ ]:
print(df.columns)
Index(['sentence', 'word_count', 'original_sentence',
       'word_count_after_conversion', 'changed', 'phonemes', 'num_spaces',
       'phonemes_str', 'sentence_with_tokens', 'phoneme_count',
       'tokenized_sentence', 'visemes', 'phoneme_length', 'viseme_length'],
      dtype='object')
In [ ]:
df.head()
Out[ ]:
sentence word_count original_sentence word_count_after_conversion changed phonemes num_spaces phonemes_str sentence_with_tokens phoneme_count tokenized_sentence visemes phoneme_length viseme_length
0 when you are cooking chips at home 7 WHEN YOU'RE COOKING CHIPS AT HOME 6 False [<sos>, w, eh, n, <space>, y, uw, <space>, aa, r, <space>, k, uh, k, ih, ng, <space>, ch, ih, p, s, <space>, ae, t, <space>, hh, ow, m, <eos>] 6 <sos> W EH N <space> Y UW <space> AA R <space> K UH K IH NG <space> CH IH P S <space> AE T <space> HH OW M <eos> <sos> when <space> you <space> are <space> cooking <space> chips <space> at <space> home <eos> 29 <sos> when <space> you <space> are <space> cooking <space> chips <space> at <space> home <eos> [<sos>, w, ey, k, <space>, w, uh, <space>, aa, w, <space>, k, uh, k, iy, k, <space>, ch, iy, p, t, <space>, ey, t, <space>, w, ao, p, <eos>] 29 29
1 the traditional chip pan often stays on the shelf 9 THE TRADITIONAL CHIP PAN OFTEN STAYS ON THE SHELF 9 False [<sos>, dh, ah, <space>, t, r, ah, d, ih, sh, ah, n, ah, l, <space>, ch, ih, p, <space>, p, ae, n, <space>, ao, f, ah, n, <space>, s, t, ey, z, <space>, aa, n, <space>, dh, ah, <space>, sh, eh, l, f, <eos>] 8 <sos> DH AH <space> T R AH D IH SH AH N AH L <space> CH IH P <space> P AE N <space> AO F AH N <space> S T EY Z <space> AA N <space> DH AH <space> SH EH L F <eos> <sos> the <space> traditional <space> chip <space> pan <space> often <space> stays <space> on <space> the <space> shelf <eos> 44 <sos> the <space> traditional <space> chip <space> pan <space> often <space> stays <space> on <space> the <space> shelf <eos> [<sos>, t, ah, <space>, t, w, ah, t, iy, ch, ah, k, ah, w, <space>, ch, iy, p, <space>, p, ey, k, <space>, ao, f, ah, k, <space>, t, t, ey, t, <space>, aa, k, <space>, t, ah, <space>, ch, ey, w, f, <eos>] 44 44
2 through what they call a knife block 7 THROUGH WHAT THEY CALL A KNIFE BLOCK 7 False [<sos>, th, r, uw, <space>, w, ah, t, <space>, dh, ey, <space>, k, ao, l, <space>, ah, <space>, n, ay, f, <space>, b, l, aa, k, <eos>] 6 <sos> TH R UW <space> W AH T <space> DH EY <space> K AO L <space> AH <space> N AY F <space> B L AA K <eos> <sos> through <space> what <space> they <space> call <space> a <space> knife <space> block <eos> 27 <sos> through <space> what <space> they <space> call <space> a <space> knife <space> block <eos> [<sos>, t, w, uh, <space>, w, ah, t, <space>, t, ey, <space>, k, ao, w, <space>, ah, <space>, k, aa, f, <space>, p, w, aa, k, <eos>] 27 27
3 which involves firing a potato down a pipe 8 WHICH INVOLVES FIRING A POTATO DOWN A PIPE 8 False [<sos>, w, ih, ch, <space>, ih, n, v, aa, l, v, z, <space>, f, ay, r, ih, ng, <space>, ah, <space>, p, ah, t, ey, t, ow, <space>, d, aw, n, <space>, ah, <space>, p, ay, p, <eos>] 7 <sos> W IH CH <space> IH N V AA L V Z <space> F AY R IH NG <space> AH <space> P AH T EY T OW <space> D AW N <space> AH <space> P AY P <eos> <sos> which <space> involves <space> firing <space> a <space> potato <space> down <space> a <space> pipe <eos> 38 <sos> which <space> involves <space> firing <space> a <space> potato <space> down <space> a <space> pipe <eos> [<sos>, w, iy, ch, <space>, iy, k, f, aa, w, f, t, <space>, f, aa, w, iy, k, <space>, ah, <space>, p, ah, t, ey, t, ao, <space>, t, aa, k, <space>, ah, <space>, p, aa, p, <eos>] 38 38
4 apart from the golden colour and the delicious flavour 9 APART FROM THE GOLDEN COLOUR AND THE DELICIOUS FLAVOUR 9 False [<sos>, ah, p, aa, r, t, <space>, f, r, ah, m, <space>, dh, ah, <space>, g, ow, l, d, ah, n, <space>, k, ah, l, aw, r, <space>, ah, n, d, <space>, dh, ah, <space>, d, ih, l, ih, sh, ah, s, <space>, f, l, ae, v, er, <eos>] 8 <sos> AH P AA R T <space> F R AH M <space> DH AH <space> G OW L D AH N <space> K AH L AW R <space> AH N D <space> DH AH <space> D IH L IH SH AH S <space> F L AE V ER <eos> <sos> apart <space> from <space> the <space> golden <space> colour <space> and <space> the <space> delicious <space> flavour <eos> 49 <sos> apart <space> from <space> the <space> golden <space> colour <space> and <space> the <space> delicious <space> flavour <eos> [<sos>, ah, p, aa, w, t, <space>, f, w, ah, p, <space>, t, ah, <space>, k, ao, w, t, ah, k, <space>, k, ah, w, aa, w, <space>, ah, k, t, <space>, t, ah, <space>, t, iy, w, iy, ch, ah, t, <space>, f, w, ey, f, er, <eos>] 49 49
In [ ]:
df.shape
Out[ ]:
(45700, 14)
In [ ]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Prepare the viseme data
viseme_tokenizer = Tokenizer(filters='', lower=False, split=' ')
viseme_tokenizer.fit_on_texts(df['visemes'])
viseme_sequences = viseme_tokenizer.texts_to_sequences(df['visemes'])
viseme_MAX_LEN = max(len(seq) for seq in viseme_sequences)
X_data = pad_sequences(viseme_sequences, maxlen=viseme_MAX_LEN, padding='post')

# Prepare the sentence data
sentence_tokenizer = Tokenizer(filters='!"#$%&()*+,-./:;=?@[\\]^_`{|}~\t\n')
sentence_tokenizer.fit_on_texts(df['sentence_with_tokens'])
sentence_sequences = sentence_tokenizer.texts_to_sequences(df['sentence_with_tokens'])
sentence_MAX_LEN = max(len(seq) for seq in sentence_sequences)
y_data = pad_sequences(sentence_sequences, maxlen=sentence_MAX_LEN, padding='post')

print("X_data:\n", X_data[:5])
print("\ny_data:\n", y_data[:5])

# Check if the special tokens <sos>, <space>, and <eos> are included in the tokenized sequences
special_tokens = ['<sos>', '<space>', '<eos>']
for token in special_tokens:
    viseme_token_index = viseme_tokenizer.word_index[token]
    sentence_token_index = sentence_tokenizer.word_index[token]
    token_in_X_data = any(viseme_token_index in seq for seq in X_data)
    token_in_y_data = any(sentence_token_index in seq for seq in y_data)
    print(f"\nIs '{token}' included in X_data? {token_in_X_data}")
    print(f"Is '{token}' included in y_data? {token_in_y_data}")
X_data:
 [[10  3  7  4  1  3 14  1  9  3  1  4 14  4  5  4  1 16  5  8  2  1  7  2
   1  3 13  8 11  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
   0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
   0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
   0  0  0  0  0  0  0  0  0  0  0  0  0]
 [10  2  6  1  2  3  6  2  5 16  6  4  6  3  1 16  5  8  1  8  7  4  1 13
  12  6  4  1  2  2  7  2  1  9  4  1  2  6  1 16  7  3 12 11  0  0  0  0
   0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
   0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
   0  0  0  0  0  0  0  0  0  0  0  0  0]
 [10  2  3 14  1  3  6  2  1  2  7  1  4 13  3  1  6  1  4  9 12  1  8  3
   9  4 11  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
   0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
   0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
   0  0  0  0  0  0  0  0  0  0  0  0  0]
 [10  3  5 16  1  5  4 12  9  3 12  2  1 12  9  3  5  4  1  6  1  8  6  2
   7  2 13  1  2  9  4  1  6  1  8  9  8 11  0  0  0  0  0  0  0  0  0  0
   0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
   0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
   0  0  0  0  0  0  0  0  0  0  0  0  0]
 [10  6  8  9  3  2  1 12  3  6  8  1  2  6  1  4 13  3  2  6  4  1  4  6
   3  9  3  1  6  4  2  1  2  6  1  2  5  3  5 16  6  2  1 12  3  7 12 15
  11  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
   0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
   0  0  0  0  0  0  0  0  0  0  0  0  0]]

y_data:
 [[   2   49    1   13    1   20    1  997    1 1629    1   38    1  145
     3    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0]
 [   2    4    1 1032    1 3014    1 4422    1  356    1 3334    1   22
     1    4    1 4423    3    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0]
 [   2  160    1   28    1   21    1  313    1    8    1 3015    1 2148
     3    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0]
 [   2   64    1 3817    1 3335    1    8    1 3818    1  115    1    8
     1 4424    3    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0]
 [   2  509    1   42    1    4    1 2036    1 1101    1    6    1    4
     1 2149    1 1809    3    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0]]

Is '<sos>' included in X_data? True
Is '<sos>' included in y_data? True

Is '<space>' included in X_data? True
Is '<space>' included in y_data? True

Is '<eos>' included in X_data? True
Is '<eos>' included in y_data? True
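To confirm the encoding is invertible, a small sketch (the decode helper is introduced here only for illustration) maps the first padded viseme and sentence rows back to tokens via each tokenizer's index_word, skipping the padding index 0:
In [ ]:
# Sketch: decode the first row of X_data and y_data back to tokens, skipping
# the padding index 0, to confirm the round trip through the tokenizers.
decode = lambda seq, tok: [tok.index_word[i] for i in seq if i != 0]
print("Visemes :", ' '.join(decode(X_data[0], viseme_tokenizer)))
print("Sentence:", ' '.join(decode(y_data[0], sentence_tokenizer)))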
In [ ]:
# Seaborn Plot
sns.set_style("whitegrid")
sns.set_context("talk")
palette = ["#3498db", "#e74c3c"]  # Blue and Red palette

# Boxplot Visualization
plt.figure(figsize=(16, 7))

# Boxplot for X_data (Viseme)
plt.subplot(1, 2, 1)
sns.boxplot(x=X_data.ravel(), color=palette[0])
plt.title('Boxplot of Encoded Values for Visemes', fontweight='bold')
plt.xlabel('Encoded Value', fontsize=14)
plt.xticks(fontsize=12)
plt.yticks(fontsize=12)

# Boxplot for y_data (Sentences)
plt.subplot(1, 2, 2)
sns.boxplot(x=y_data.ravel(), color=palette[1])
plt.title('Boxplot of Encoded Values for Sentences', fontweight='bold')
plt.xlabel('Encoded Value', fontsize=14)
plt.xticks(fontsize=12)
plt.yticks(fontsize=12)

plt.tight_layout()
plt.show()
[Figure: boxplots of encoded values for viseme and sentence sequences]
In [ ]:
# Check the structure of the df['visemes'] column
print("First 5 entries in 'visemes' column:")
print(df['visemes'].head())

# Check the structure of the df['tokenized_sentence'] column
print("\nFirst 5 entries in 'tokenized_sentence' column:")
print(df['sentence_with_tokens'].head())

# Check if the special tokens <sos>, <space>, and <eos> are already included in the data
special_tokens = ['<sos>', '<space>', '<eos>']
for token in special_tokens:
    token_in_visemes = df['visemes'].apply(lambda x: token in x).any()
    token_in_tokenized_sentence = df['sentence_with_tokens'].apply(lambda x: token in x).any()
    print(f"\nIs '{token}' included in 'visemes' column? {token_in_visemes}")
    print(f"Is '{token}' included in 'tokenized_sentence' column? {token_in_tokenized_sentence}")
First 5 entries in 'visemes' column:
0                                                                                   [<sos>, w, ey, k, <space>, w, uh, <space>, aa, w, <space>, k, uh, k, iy, k, <space>, ch, iy, p, t, <space>, ey, t, <space>, w, ao, p, <eos>]
1                   [<sos>, t, ah, <space>, t, w, ah, t, iy, ch, ah, k, ah, w, <space>, ch, iy, p, <space>, p, ey, k, <space>, ao, f, ah, k, <space>, t, t, ey, t, <space>, aa, k, <space>, t, ah, <space>, ch, ey, w, f, <eos>]
2                                                                                           [<sos>, t, w, uh, <space>, w, ah, t, <space>, t, ey, <space>, k, ao, w, <space>, ah, <space>, k, aa, f, <space>, p, w, aa, k, <eos>]
3                                              [<sos>, w, iy, ch, <space>, iy, k, f, aa, w, f, t, <space>, f, aa, w, iy, k, <space>, ah, <space>, p, ah, t, ey, t, ao, <space>, t, aa, k, <space>, ah, <space>, p, aa, p, <eos>]
4    [<sos>, ah, p, aa, w, t, <space>, f, w, ah, p, <space>, t, ah, <space>, k, ao, w, t, ah, k, <space>, k, ah, w, aa, w, <space>, ah, k, t, <space>, t, ah, <space>, t, iy, w, iy, ch, ah, t, <space>, f, w, ey, f, er, <eos>]
Name: visemes, dtype: object

First 5 entries in 'tokenized_sentence' column:
0                                        <sos> when <space> you <space> are <space> cooking <space> chips <space> at <space> home <eos>
1         <sos> the <space> traditional <space> chip <space> pan <space> often <space> stays <space> on <space> the <space> shelf <eos>
2                                      <sos> through <space> what <space> they <space> call <space> a <space> knife <space> block <eos>
3                        <sos> which <space> involves <space> firing <space> a <space> potato <space> down <space> a <space> pipe <eos>
4    <sos> apart <space> from <space> the <space> golden <space> colour <space> and <space> the <space> delicious <space> flavour <eos>
Name: sentence_with_tokens, dtype: object

Is '<sos>' included in 'visemes' column? True
Is '<sos>' included in 'tokenized_sentence' column? True

Is '<space>' included in 'visemes' column? True
Is '<space>' included in 'tokenized_sentence' column? True

Is '<eos>' included in 'visemes' column? True
Is '<eos>' included in 'tokenized_sentence' column? True
In [ ]:
# Prepare the viseme and sentence data
viseme_tokenizer = Tokenizer(filters='', lower=False, split=' ')
viseme_tokenizer.fit_on_texts(df['visemes'])
viseme_sequences = viseme_tokenizer.texts_to_sequences(df['visemes'])
viseme_MAX_LEN = max(len(seq) for seq in viseme_sequences)
X_data = pad_sequences(viseme_sequences, maxlen=viseme_MAX_LEN, padding='post')

sentence_tokenizer = Tokenizer(filters='!"#$%&()*+,-./:;=?@[\\]^_`{|}~\t\n')
sentence_tokenizer.fit_on_texts(df['sentence_with_tokens'])
sentence_sequences = sentence_tokenizer.texts_to_sequences(df['sentence_with_tokens'])
sentence_MAX_LEN = max(len(seq) for seq in sentence_sequences)
y_data = pad_sequences(sentence_sequences, maxlen=sentence_MAX_LEN, padding='post')
In [ ]:
# Calculate lengths for each sequence in viseme and sentence sequences
viseme_lengths = [len(seq) for seq in viseme_sequences]
sentence_lengths = [len(seq) for seq in sentence_sequences]

# Descriptive Statistics
print("=== Viseme Sequences ===")
print(f"Average Length: {np.mean(viseme_lengths)}")
print(f"Minimum Length: {np.min(viseme_lengths)}")
print(f"Maximum Length: {np.max(viseme_lengths)}")
print("\n")

print("=== Sentence Sequences ===")
print(f"Average Length: {np.mean(sentence_lengths)}")
print(f"Minimum Length: {np.min(sentence_lengths)}")
print(f"Maximum Length: {np.max(sentence_lengths)}")
print("\n")

# Token Frequency
viseme_freq = pd.Series([item for sublist in viseme_sequences for item in sublist]).value_counts()
sentence_freq = pd.Series([item for sublist in sentence_sequences for item in sublist]).value_counts()

print("=== Most Frequent Visemes ===")
print(viseme_freq.head(10))
print("\n")

print("=== Most Frequent Words ===")
print(sentence_freq.head(10))
print("\n")

# Special Tokens
for token in special_tokens:
    viseme_token_count = sum([seq.count(viseme_tokenizer.word_index[token]) for seq in viseme_sequences])
    sentence_token_count = sum([seq.count(sentence_tokenizer.word_index[token]) for seq in sentence_sequences])
    print(f"Occurrences of '{token}' in viseme sequences: {viseme_token_count}")
    print(f"Occurrences of '{token}' in sentence sequences: {sentence_token_count}")
    print("\n")
=== Viseme Sequences ===
Average Length: 34.041969365426695
Minimum Length: 11
Maximum Length: 109


=== Sentence Sequences ===
Average Length: 15.970196936542669
Minimum Length: 7
Maximum Length: 53


=== Most Frequent Visemes ===
1     297858
2     265845
       ...  
9      60570
10     45700
Length: 10, dtype: int64


=== Most Frequent Words ===
1     296369
2      45700
       ...  
9       7996
10      7692
Length: 10, dtype: int64


Occurrences of '<sos>' in viseme sequences: 45700
Occurrences of '<sos>' in sentence sequences: 45700


Occurrences of '<space>' in viseme sequences: 297858
Occurrences of '<space>' in sentence sequences: 296369


Occurrences of '<eos>' in viseme sequences: 45700
Occurrences of '<eos>' in sentence sequences: 45700


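The frequency tables above are keyed by integer indices rather than tokens. A short sketch mapping the top indices back through index_word (index 1 is expected to be <space> in both vocabularies):
In [ ]:
# Sketch: translate the most frequent sequence indices back to their tokens.
print({int(idx): viseme_tokenizer.index_word[idx] for idx in viseme_freq.head(5).index})
print({int(idx): sentence_tokenizer.index_word[idx] for idx in sentence_freq.head(5).index})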
In [ ]:
# Split data into train and test sets
X_train, X_test, y_train, y_test, train_indices, test_indices = train_test_split(
    X_data, y_data, range(len(X_data)), test_size=0.2, random_state=42
)


# Create TensorFlow Dataset objects (prepared for batched pipelines; model.fit below is called on the arrays directly)
batch_size = 64
train_dataset = tf.data.Dataset.from_tensor_slices(((X_train, y_train), y_train)).batch(batch_size).prefetch(buffer_size=tf.data.AUTOTUNE)
test_dataset = tf.data.Dataset.from_tensor_slices(((X_test, y_test), y_test)).batch(batch_size).prefetch(buffer_size=tf.data.AUTOTUNE)

# Define the model
embedding_dim = 128
units = 256

# Encoder
encoder_inputs = Input(shape=(viseme_MAX_LEN,))
encoder_embedding_layer = Embedding(input_dim=len(viseme_tokenizer.word_index) + 1, output_dim=embedding_dim)
encoder_embedding = encoder_embedding_layer(encoder_inputs)
encoder_gru = GRU(units, return_sequences=True, return_state=True)
encoder_outputs, encoder_state = encoder_gru(encoder_embedding)

# Decoder
decoder_inputs = Input(shape=(sentence_MAX_LEN,))
decoder_embedding_layer = Embedding(input_dim=len(sentence_tokenizer.word_index) + 1, output_dim=embedding_dim)
decoder_embedding = decoder_embedding_layer(decoder_inputs)
decoder_gru = GRU(units, return_sequences=True)
decoder_outputs = decoder_gru(decoder_embedding, initial_state=encoder_state)

# Attention
attention = Attention()
context_vector = attention([decoder_outputs, encoder_outputs])

# Concatenate context vector and decoder output
decoder_combined = tf.concat([context_vector, decoder_outputs], axis=-1)

# Dense layer
decoder_dense = Dense(len(sentence_tokenizer.word_index) + 1, activation='softmax')
decoder_outputs = decoder_dense(decoder_combined)

# Model
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)
model.compile(optimizer=Adam(), loss='sparse_categorical_crossentropy', metrics=['accuracy'])
# Define early stopping callback
early_stopping_callback = EarlyStopping(monitor='val_loss', patience=5)

# Train the model
history = model.fit(
    [X_train, y_train],
    y_train,
    batch_size=batch_size,
    epochs=5,
    validation_data=([X_test, y_test], y_test),
    callbacks=[early_stopping_callback]
)

# Evaluate the model on the test set
test_loss, test_acc = model.evaluate([X_test, y_test], y_test)
print("Test Loss:", test_loss)
print("Test Accuracy:", test_acc)
Epoch 1/5
572/572 [==============================] - 376s 651ms/step - loss: 1.1226 - accuracy: 0.8587 - val_loss: 0.5126 - val_accuracy: 0.9333
Epoch 2/5
572/572 [==============================] - 370s 647ms/step - loss: 0.3519 - accuracy: 0.9546 - val_loss: 0.2554 - val_accuracy: 0.9695
Epoch 3/5
572/572 [==============================] - 365s 638ms/step - loss: 0.1824 - accuracy: 0.9781 - val_loss: 0.1546 - val_accuracy: 0.9835
Epoch 4/5
572/572 [==============================] - 365s 637ms/step - loss: 0.1049 - accuracy: 0.9875 - val_loss: 0.1117 - val_accuracy: 0.9891
Epoch 5/5
572/572 [==============================] - 363s 635ms/step - loss: 0.0640 - accuracy: 0.9920 - val_loss: 0.0937 - val_accuracy: 0.9916
286/286 [==============================] - 49s 172ms/step - loss: 0.0937 - accuracy: 0.9916
Test Loss: 0.09373828768730164
Test Accuracy: 0.9915651679039001
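The accuracy above is token-level, computed under teacher forcing (the decoder receives the ground-truth sentence as input), and it also counts the post-padding zeros. A minimal sketch, reusing the arrays prepared above, that recomputes token accuracy with padding positions masked out:
In [ ]:
# Sketch: recompute token accuracy over non-padding positions only, still under
# teacher forcing, to see how much the padded zeros inflate the reported figure.
pred_ids = model.predict([X_test, y_test], batch_size=64, verbose=0).argmax(axis=-1)
mask = y_test != 0
print(f"Token accuracy incl. padding: {(pred_ids == y_test).mean():.4f}")
print(f"Token accuracy excl. padding: {(pred_ids[mask] == y_test[mask]).mean():.4f}")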
In [ ]:
# 1. Advanced Training and Validation Loss Curve
plt.figure(figsize=(12, 6))
plt.plot(history.history['loss'], label='Training Loss', color='blue', linestyle='--')
plt.plot(history.history['val_loss'], label='Validation Loss', color='red')
plt.scatter(np.argmin(history.history['val_loss']), min(history.history['val_loss']), s=100, c='red', marker='o')
plt.title('Advanced Training and Validation Loss over Epochs')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend()
plt.grid(True, which='both', linestyle='--', linewidth=0.5)
plt.tight_layout()
plt.show()

# 2. Advanced Training and Validation Accuracy Curve
plt.figure(figsize=(12, 6))
plt.plot(history.history['accuracy'], label='Training Accuracy', color='blue', linestyle='--')
plt.plot(history.history['val_accuracy'], label='Validation Accuracy', color='purple')
plt.scatter(np.argmax(history.history['val_accuracy']), max(history.history['val_accuracy']), s=100, c='purple', marker='o')
plt.title('Advanced Training and Validation Accuracy over Epochs')
plt.xlabel('Epochs')
plt.ylabel('Accuracy')
plt.legend()
plt.grid(True, which='both', linestyle='--', linewidth=0.5)
plt.tight_layout()
plt.show()

# 3. Model Architecture Visualization
plot_model(model, to_file='advanced_model_plot.png', show_shapes=True, show_layer_names=True, expand_nested=True)
plt.figure(figsize=(20, 20))
img = plt.imread('advanced_model_plot.png')
plt.imshow(img)
plt.axis('off')
plt.title('Advanced Model Architecture Visualization')
plt.show()

# 4. Advanced Final Test Loss and Accuracy
plt.figure(figsize=(10, 6))
bar_width = 0.35
index = np.arange(2)
bars1 = [test_loss, history.history['val_loss'][-1]]
bars2 = [test_acc, history.history['val_accuracy'][-1]]
rects1 = plt.bar(index, bars1, bar_width, label='Test', color='blue', alpha=0.8)
rects2 = plt.bar(index + bar_width, bars2, bar_width, label='Validation (Final Epoch)', color='green', alpha=0.8)
plt.xlabel('Metrics')
plt.ylabel('Values')
plt.title('Test Loss and Accuracy vs. Validation Metrics')
plt.xticks(index + bar_width / 2, ('Loss', 'Accuracy'))
plt.legend()
plt.tight_layout()
plt.show()
[Figures: training/validation loss curve, training/validation accuracy curve, model architecture diagram, and test vs. final-epoch validation loss/accuracy bars]
In [ ]:
print(test_indices[:10])
print([df['visemes'].iloc[idx] for idx in test_indices[:10]])
[29468, 44953, 23062, 43578, 2182, 26391, 21035, 35684, 39308, 1911]
[['<sos>', 'w', 'iy', '<space>', 'k', 'iy', 't', '<space>', 'w', 'aa', 'w', 't', '<space>', 'f', 'ey', 'k', 't', 't', '<eos>'], ['<sos>', 't', 'ey', 't', '<space>', 'p', 'iy', 'k', 't', '<space>', 'f', 'ao', 'w', '<space>', 't', 'ah', '<space>', 'f', 'er', 't', 't', '<space>', 't', 'aa', 'p', '<space>', 't', 'iy', 'k', 't', '<space>', 'w', 'iy', '<space>', 'ch', 'ao', 'k', 't', '<space>', 't', 'ah', '<space>', 'w', 'uh', '<eos>'], ['<sos>', 'aa', '<space>', 't', 'iy', 'k', 'k', '<space>', 't', 'iy', 't', '<space>', 'iy', 't', '<space>', 'ah', '<space>', 'w', 'iy', 'w', 'iy', '<space>', 'k', 'uh', 't', '<space>', 'w', 'iy', 't', 'ah', 'w', '<space>', 'f', 'w', 'ey', 't', '<eos>'], ['<sos>', 'p', 'ey', 't', 't', '<space>', 'aa', 'k', '<space>', 't', 'ah', '<space>', 'f', 'ey', 'k', 't', '<space>', 'aa', '<space>', 'w', 'ey', 'f', '<space>', 'w', 'er', 'k', 't', '<space>', 'w', 'iy', 't', '<space>', 'w', 'w', 'uh', '<space>', 'f', 'ao', 'w', '<space>', 'w', 'ao', 'k', 'k', 'er', '<eos>'], ['<sos>', 't', 'ah', '<space>', 'ah', 't', 'er', 't', '<space>', 'ao', 'k', 'w', 'iy', '<space>', 'k', 'ey', 't', '<space>', 'k', 'ey', 'p', 'ah', 'k', '<eos>'], ['<sos>', 'iy', 't', '<space>', 'iy', 't', '<space>', 'k', 'aa', 't', '<space>', 'ao', 'f', 'ah', 'k', '<space>', 'aa', '<space>', 'w', 'uh', 't', '<space>', 'p', 'aa', '<space>', 't', 'ey', 'p', 'p', 'er', '<eos>'], ['<sos>', 't', 'iy', 't', '<space>', 'w', 'aa', 't', '<space>', 't', 'ah', '<space>', 't', 'aa', 'p', '<space>', 'iy', 'k', '<space>', 'w', 'iy', 'ch', '<eos>'], ['<sos>', 't', 'ah', '<space>', 'ch', 'ao', '<space>', 'w', 'ey', 'w', '<space>', 'ah', '<space>', 't', 'iy', 'p', '<space>', 'ah', 'f', '<space>', 'f', 'aa', 'f', '<space>', 'k', 'w', 'iy', 't', '<space>', 'ch', 'ey', 'w', 'ah', 'k', 'ch', 'er', 't', '<space>', 'p', 'iy', 't', '<space>', 't', 'ey', 'w', '<space>', 'w', 'iy', 't', 't', '<space>', 'ah', 'k', 'ey', 'k', 't', 't', '<space>', 'p', 'aa', 't', 'ah', 'p', 'w', 'iy', '<space>', 't', 'ah', '<space>', 'k', 'w', 'ey', 't', 'ah', 't', 't', '<space>', 'k', 'w', 'iy', 't', '<eos>'], ['<sos>', 'ah', 'k', 't', '<space>', 't', 'ao', '<space>', 'w', 'iy', '<space>', 'w', 'ey', 'f', '<space>', 't', 'w', 'aa', 't', '<space>', 't', 'uh', '<space>', 'p', 'iy', '<space>', 'k', 'uh', 't', '<space>', 'p', 'ey', 'k', 'er', 't', '<space>', 'p', 'aa', '<space>', 't', 'ey', 'iy', 'k', '<space>', 't', 'ey', 'k', '<eos>'], ['<sos>', 'w', 'aa', 'w', '<space>', 'ey', 'k', 'iy', '<space>', 'ey', 't', '<space>', 't', 't', 'iy', 'w', '<space>', 'ah', '<space>', 'p', 'ey', 'p', '<eos>']]
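A quick alignment check (a sketch; the expected output is True): X_test rows should correspond to df rows via test_indices, which the evaluation below relies on.
In [ ]:
# Sketch: decode the first test sequence and compare it with the visemes stored
# in df at the matching index; the two should be identical.
decoded = [viseme_tokenizer.index_word[i] for i in X_test[0] if i != 0]
print(decoded == list(df['visemes'].iloc[test_indices[0]]))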
In [ ]:
# Function to convert predicted token IDs to text
def sequences_to_texts(sequences, tokenizer):
    texts = tokenizer.sequences_to_texts(sequences)
    return texts

# Select a subset from the test data for evaluation
num_examples = 15
X_test_subset = X_test[:num_examples]
y_test_subset = y_test[:num_examples]

original_visemes = df['visemes'].tolist()

# Get the visemes for the selected test subset
test_visemes_subset = [original_visemes[idx] for idx in test_indices[:num_examples]]

# Generate predictions on the subset of the test set
predictions = model.predict([X_test_subset, y_test_subset])

# Convert predicted token IDs to text
predicted_sentences = sequences_to_texts(predictions.argmax(axis=-1), sentence_tokenizer)

# Convert original token IDs to text
original_sentences = sequences_to_texts(y_test_subset, sentence_tokenizer)

# Initialize WER, BLEU, and CER scores
wer_scores = []
bleu_scores = []

# Print the original sentences, predicted sentences, and visemes side by side
for original, predicted, viseme_seq in zip(original_sentences, predicted_sentences, test_visemes_subset):
    viseme_seq_text = ' '.join(viseme_seq)
    print(f"Original: {original}\nPredicted: {predicted}\nVisemes: {viseme_seq_text}")

    # Calculate WER
    wer = jiwer.wer(original, predicted)
    wer_scores.append(wer)
    print(f"WER: {wer:.4f}")

    # Tokenize sentences for BLEU score calculation
    original_tokens = original.split()
    predicted_tokens = predicted.split()

    # Calculate BLEU score
    bleu_score = corpus_bleu([[original_tokens]], [predicted_tokens], smoothing_function=SmoothingFunction().method3)
    bleu_scores.append(bleu_score)
    print(f"BLEU Score: {bleu_score:.4f}")

    print("-" * 50)
# Calculate average WER and BLEU scores
average_wer = sum(wer_scores) / len(wer_scores)
average_bleu = sum(bleu_scores) / len(bleu_scores)

print(f"Average WER: {average_wer:.4f}")
print(f"Average BLEU Score: {average_bleu:.4f}")
1/1 [==============================] - 1s 812ms/step
Original: <sos> we <space> need <space> hard <space> facts <eos>
Predicted: <sos> we <space> need <space> hard <space> facts <eos>
Visemes: <sos> w iy <space> k iy t <space> w aa w t <space> f ey k t t <eos>
WER: 0.0000
BLEU Score: 1.0000
--------------------------------------------------
Original: <sos> that <space> means <space> for <space> the <space> first <space> time <space> since <space> we <space> joined <space> the <space> eu <eos>
Predicted: <sos> that <space> means <space> for <space> the <space> first <space> time <space> since <space> we <space> joined <space> the <space> eu <eos>
Visemes: <sos> t ey t <space> p iy k t <space> f ao w <space> t ah <space> f er t t <space> t aa p <space> t iy k t <space> w iy <space> ch ao k t <space> t ah <space> w uh <eos>
WER: 0.0000
BLEU Score: 1.0000
--------------------------------------------------
Original: <sos> i <space> think <space> this <space> is <space> a <space> really <space> good <space> little <space> flat <eos>
Predicted: <sos> i <space> think <space> this <space> is <space> a <space> really <space> good <space> little <space> flat <eos>
Visemes: <sos> aa <space> t iy k k <space> t iy t <space> iy t <space> ah <space> w iy w iy <space> k uh t <space> w iy t ah w <space> f w ey t <eos>
WER: 0.0000
BLEU Score: 1.0000
--------------------------------------------------
Original: <sos> based <space> on <space> the <space> fact <space> i <space> have <space> worked <space> with <space> hugh <space> for <space> longer <eos>
Predicted: <sos> based <space> on <space> the <space> fact <space> i <space> have <space> worked <space> with <space> launch <space> for <space> longer <eos>
Visemes: <sos> p ey t t <space> aa k <space> t ah <space> f ey k t <space> aa <space> w ey f <space> w er k t <space> w iy t <space> w w uh <space> f ao w <space> w ao k k er <eos>
WER: 0.0435
BLEU Score: 0.8787
--------------------------------------------------
Original: <sos> the <space> others <space> only <space> get <space> gammon <eos>
Predicted: <sos> the <space> others <space> only <space> get <space> tim <eos>
Visemes: <sos> t ah <space> ah t er t <space> ao k w iy <space> k ey t <space> k ey p ah k <eos>
WER: 0.0909
BLEU Score: 0.8071
--------------------------------------------------
Original: <sos> it <space> is <space> not <space> often <space> i <space> lose <space> my <space> temper <eos>
Predicted: <sos> it <space> is <space> not <space> often <space> i <space> lose <space> my <space> nickname <eos>
Visemes: <sos> iy t <space> iy t <space> k aa t <space> ao f ah k <space> aa <space> w uh t <space> p aa <space> t ey p p er <eos>
WER: 0.0588
BLEU Score: 0.8844
--------------------------------------------------
Original: <sos> this <space> was <space> the <space> time <space> in <space> which <eos>
Predicted: <sos> this <space> was <space> the <space> time <space> in <space> which <eos>
Visemes: <sos> t iy t <space> w aa t <space> t ah <space> t aa p <space> iy k <space> w iy ch <eos>
WER: 0.0000
BLEU Score: 1.0000
--------------------------------------------------
Original: <sos> the <space> show <space> where <space> a <space> team <space> of <space> five <space> quiz <space> challengers <space> pit <space> their <space> wits <space> against <space> possibly <space> the <space> greatest <space> quiz <eos>
Predicted: <sos> the <space> show <space> where <space> a <space> team <space> of <space> five <space> quiz <space> challengers <space> pit <space> their <space> wits <space> against <space> possibly <space> the <space> greatest <space> quiz <eos>
Visemes: <sos> t ah <space> ch ao <space> w ey w <space> ah <space> t iy p <space> ah f <space> f aa f <space> k w iy t <space> ch ey w ah k ch er t <space> p iy t <space> t ey w <space> w iy t t <space> ah k ey k t t <space> p aa t ah p w iy <space> t ah <space> k w ey t ah t t <space> k w iy t <eos>
WER: 0.0000
BLEU Score: 1.0000
--------------------------------------------------
Original: <sos> and <space> so <space> we <space> have <space> tried <space> to <space> be <space> good <space> mannered <space> by <space> saying <space> ten <eos>
Predicted: <sos> and <space> so <space> we <space> have <space> tried <space> to <space> be <space> good <space> topical <space> by <space> saying <space> ten <eos>
Visemes: <sos> ah k t <space> t ao <space> w iy <space> w ey f <space> t w aa t <space> t uh <space> p iy <space> k uh t <space> p ey k er t <space> p aa <space> t ey iy k <space> t ey k <eos>
WER: 0.0400
BLEU Score: 0.8895
--------------------------------------------------
Original: <sos> while <space> annie's <space> still <space> a <space> babe <eos>
Predicted: <sos> while <space> tim <space> still <space> a <space> peake <eos>
Visemes: <sos> w aa w <space> ey k iy <space> ey t <space> t t iy w <space> ah <space> p ey p <eos>
WER: 0.1818
BLEU Score: 0.4833
--------------------------------------------------
Original: <sos> which <space> was <space> the <space> area <space> that <space> they <space> wanted <space> us <space> to <space> take <space> the <space> casualty <eos>
Predicted: <sos> which <space> was <space> the <space> area <space> that <space> they <space> wanted <space> us <space> to <space> take <space> the <space> depth <eos>
Visemes: <sos> w iy ch <space> w aa t <space> t ah <space> ey w iy ah <space> t ey t <space> t ey <space> w aa k t ah t <space> ah t <space> t uh <space> t ey k <space> t ah <space> k ey ch ah w ah w t iy <eos>
WER: 0.0400
BLEU Score: 0.9245
--------------------------------------------------
Original: <sos> who <space> do <space> i <space> see <space> about <space> a <space> death <space> certificate <eos>
Predicted: <sos> who <space> do <space> i <space> see <space> about <space> a <space> death <space> certificate <eos>
Visemes: <sos> w uh <space> t uh <space> aa <space> t iy <space> ah p aa t <space> ah <space> t ey t <space> t er t iy f iy k ah t <eos>
WER: 0.0000
BLEU Score: 1.0000
--------------------------------------------------
Original: <sos> reported <space> back <space> to <space> his <space> cabinet <eos>
Predicted: <sos> reported <space> back <space> to <space> his <space> cabinet <eos>
Visemes: <sos> w iy p ao w t ah t <space> p ey k <space> t uh <space> w iy t <space> k ey p ah k ah t <eos>
WER: 0.0000
BLEU Score: 1.0000
--------------------------------------------------
Original: <sos> you <space> want <space> to <space> say <space> to <space> people <space> round <space> the <space> dining <space> table <eos>
Predicted: <sos> you <space> want <space> to <space> say <space> to <space> people <space> round <space> the <space> dining <space> table <eos>
Visemes: <sos> w uh <space> w aa k t <space> t uh <space> t ey <space> t uh <space> p iy p ah w <space> w aa k t <space> t ah <space> t aa k iy k <space> t ey p ah w <eos>
WER: 0.0000
BLEU Score: 1.0000
--------------------------------------------------
Original: <sos> sold <space> in <space> aid <space> of <space> children <space> in <space> need <eos>
Predicted: <sos> sold <space> in <space> aid <space> of <space> children <space> in <space> need <eos>
Visemes: <sos> t ao w t <space> iy k <space> ey t <space> ah f <space> ch iy w t w ah k <space> iy k <space> k iy t <eos>
WER: 0.0000
BLEU Score: 1.0000
--------------------------------------------------
Average WER: 0.0303
Average BLEU Score: 0.9245
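Note that the reference and predicted strings above still contain the <sos>, <space> and <eos> tokens, which are almost always predicted correctly and therefore pull WER down and BLEU up. A minimal sketch (strip_special is introduced here only for illustration) that removes these tokens before scoring with jiwer:
In [ ]:
# Sketch: strip the special tokens before scoring so WER reflects the words only.
def strip_special(text, specials=('<sos>', '<eos>', '<space>')):
    return ' '.join(w for w in text.split() if w not in specials)

plain_wer = [jiwer.wer(strip_special(o), strip_special(p))
             for o, p in zip(original_sentences, predicted_sentences)]
print(f"Average WER on plain words: {sum(plain_wer) / len(plain_wer):.4f}")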
In [ ]:
import matplotlib.pyplot as plt

# Create a list of set labels
sets = [f"Set {i+1}" for i in range(num_examples)]

# Create a figure and a set of subplots
fig, ax = plt.subplots(2, 1, figsize=(12, 10))

# Plot WER scores with markers
ax[0].plot(sets, wer_scores, color='steelblue', marker='o', label='WER per Set')
ax[0].axhline(average_wer, color='coral', linestyle='dashed', linewidth=1, label=f'Average WER: {average_wer:.4f}')
ax[0].set_title('Word Error Rate (WER) for Each Set')
ax[0].set_ylabel('WER')
ax[0].set_xticks(sets)
ax[0].set_xticklabels(sets, rotation=45)
ax[0].legend()

# Plot BLEU scores with markers
ax[1].plot(sets, bleu_scores, color='steelblue', marker='o', label='BLEU Score per Set')
ax[1].axhline(average_bleu, color='coral', linestyle='dashed', linewidth=1, label=f'Average BLEU: {average_bleu:.4f}')
ax[1].set_title('BLEU Score for Each Set')
ax[1].set_ylabel('BLEU Score')
ax[1].set_xticks(sets)
ax[1].set_xticklabels(sets, rotation=45)
ax[1].legend()

# Adjust the layout
plt.tight_layout()
plt.show()
[Figure: WER and BLEU score per evaluation set, with average reference lines]
In [ ]: