1. Installing Libraries
In [ ]:
!pip install pyenchant contractions g2p-en cmudict jiwer
import nltk
!pip install -U nltk
nltk.download('averaged_perceptron_tagger')
nltk.download('punkt')
nltk.download('cmudict')
!pip install enchant
Requirement already satisfied: pyenchant in /usr/local/lib/python3.10/dist-packages (3.2.2) Requirement already satisfied: contractions in /usr/local/lib/python3.10/dist-packages (0.1.73) Requirement already satisfied: g2p-en in /usr/local/lib/python3.10/dist-packages (2.1.0) Requirement already satisfied: cmudict in /usr/local/lib/python3.10/dist-packages (1.0.13) Requirement already satisfied: jiwer in /usr/local/lib/python3.10/dist-packages (3.0.3) Requirement already satisfied: textsearch>=0.0.21 in /usr/local/lib/python3.10/dist-packages (from contractions) (0.0.24) Requirement already satisfied: numpy>=1.13.1 in /usr/local/lib/python3.10/dist-packages (from g2p-en) (1.23.5) Requirement already satisfied: nltk>=3.2.4 in /usr/local/lib/python3.10/dist-packages (from g2p-en) (3.8.1) Requirement already satisfied: inflect>=0.3.1 in /usr/local/lib/python3.10/dist-packages (from g2p-en) (7.0.0) Requirement already satisfied: distance>=0.1.3 in /usr/local/lib/python3.10/dist-packages (from g2p-en) (0.1.3) Requirement already satisfied: importlib-metadata<6.0.0,>=5.1.0 in /usr/local/lib/python3.10/dist-packages (from cmudict) (5.2.0) Requirement already satisfied: importlib-resources<6.0.0,>=5.10.1 in /usr/local/lib/python3.10/dist-packages (from cmudict) (5.13.0) Requirement already satisfied: click<9.0.0,>=8.1.3 in /usr/local/lib/python3.10/dist-packages (from jiwer) (8.1.7) Requirement already satisfied: rapidfuzz<4,>=3 in /usr/local/lib/python3.10/dist-packages (from jiwer) (3.3.0) Requirement already satisfied: zipp>=0.5 in /usr/local/lib/python3.10/dist-packages (from importlib-metadata<6.0.0,>=5.1.0->cmudict) (3.16.2) Requirement already satisfied: pydantic>=1.9.1 in /usr/local/lib/python3.10/dist-packages (from inflect>=0.3.1->g2p-en) (1.10.12) Requirement already satisfied: typing-extensions in /usr/local/lib/python3.10/dist-packages (from inflect>=0.3.1->g2p-en) (4.5.0) Requirement already satisfied: joblib in /usr/local/lib/python3.10/dist-packages (from nltk>=3.2.4->g2p-en) (1.3.2) Requirement already satisfied: regex>=2021.8.3 in /usr/local/lib/python3.10/dist-packages (from nltk>=3.2.4->g2p-en) (2023.6.3) Requirement already satisfied: tqdm in /usr/local/lib/python3.10/dist-packages (from nltk>=3.2.4->g2p-en) (4.66.1) Requirement already satisfied: anyascii in /usr/local/lib/python3.10/dist-packages (from textsearch>=0.0.21->contractions) (0.3.2) Requirement already satisfied: pyahocorasick in /usr/local/lib/python3.10/dist-packages (from textsearch>=0.0.21->contractions) (2.0.0) Requirement already satisfied: nltk in /usr/local/lib/python3.10/dist-packages (3.8.1) Requirement already satisfied: click in /usr/local/lib/python3.10/dist-packages (from nltk) (8.1.7) Requirement already satisfied: joblib in /usr/local/lib/python3.10/dist-packages (from nltk) (1.3.2) Requirement already satisfied: regex>=2021.8.3 in /usr/local/lib/python3.10/dist-packages (from nltk) (2023.6.3) Requirement already satisfied: tqdm in /usr/local/lib/python3.10/dist-packages (from nltk) (4.66.1)
[nltk_data] Downloading package averaged_perceptron_tagger to [nltk_data] /root/nltk_data... [nltk_data] Package averaged_perceptron_tagger is already up-to- [nltk_data] date! [nltk_data] Downloading package punkt to /root/nltk_data... [nltk_data] Package punkt is already up-to-date! [nltk_data] Downloading package cmudict to /root/nltk_data... [nltk_data] Package cmudict is already up-to-date!
Requirement already satisfied: enchant in /usr/local/lib/python3.10/dist-packages (0.0.1)
In [ ]:
from google.colab import drive
drive.mount('/content/drive')
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
In [ ]:
from pathlib import Path
import re
import pandas as pd
import numpy as np
import random
import string
from collections import Counter
import matplotlib.pyplot as plt
from collections import defaultdict
from wordcloud import WordCloud
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import cmudict
import enchant
from g2p_en import G2p
import inflect
import contractions
import seaborn as sns
from re import match
from multiprocessing import Pool
from nltk.corpus import cmudict
import contractions
from itertools import chain
from IPython.display import display
import tensorflow as tf
from tensorflow.keras.layers import Input, Embedding, GRU, Dense, Dropout, Attention
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.utils import plot_model
import jiwer
from nltk.translate.bleu_score import corpus_bleu, SmoothingFunction
In [ ]:
# Define the file path
path = Path("/content/drive/MyDrive/Dissertation/g_train.txt")
# Read the file content
with open(path, 'r', encoding='utf-8') as file:
lines = file.readlines()
# Number of lines in the dataset
num_lines = len(lines)
print(f"Number of lines in the dataset are: {num_lines}")
# Display first few lines
print("\nThe first 5 lines are:")
for line in lines[:5]:
print(line.strip())
# Total number of words in the dataset
total_words = sum(len(line.split()) for line in lines)
print(f"\nThe Total number of words are: {total_words}")
# Average number of words per line
avg_words_per_line = total_words / num_lines
print(f"The average number of words per line are: {avg_words_per_line:.2f}")
Number of lines in the dataset are: 45839 The first 5 lines are: 5535415699068794046/00001, WHEN YOU'RE COOKING CHIPS AT HOME 5535415699068794046/00002, THE TRADITIONAL CHIP PAN OFTEN STAYS ON THE SHELF 5535415699068794046/00003, THROUGH WHAT THEY CALL A KNIFE BLOCK 5535415699068794046/00004, WHICH INVOLVES FIRING A POTATO DOWN A PIPE 5535415699068794046/00006, APART FROM THE GOLDEN COLOUR AND THE DELICIOUS FLAVOUR The Total number of words are: 375019 The average number of words per line are: 8.18
In [ ]:
# Create inflect engine
p = inflect.engine()
# Function to convert numbers to words
def convert_numbers_to_words(text):
words_with_numbers = []
words = text.split()
for i, word in enumerate(words):
if word.isdigit():
words_with_numbers.append((word, p.number_to_words(word)))
words[i] = words_with_numbers[-1][1]
return ' '.join(words), words_with_numbers
# Function to replace hyphens with spaces
def replace_hyphens_with_spaces(text):
return text.replace('-', ' ')
lines = []
numbers_converted = []
with open(path, encoding='utf-8') as file:
for line in file.readlines():
cleaned_line, numbers = convert_numbers_to_words(re.sub(r'^\d+/\d+,\s*', '', line).strip())
lines.append(cleaned_line)
numbers_converted.extend(numbers)
# Convert to DataFrame and process
df = pd.DataFrame(lines, columns=['sentence'])
df['sentence'] = df['sentence'].apply(str.lstrip)
df['sentence'] = df['sentence'].apply(replace_hyphens_with_spaces)
# Print converted numbers
print("\nNumbers converted to words:")
for number, words in numbers_converted:
print(f"{number}: {words.replace('-', ' ')}")
# Additional insights
print("\nSome additional insights:")
print("Average sentence length:", df['sentence'].str.split().apply(len).mean())
print("Max sentence length:", df['sentence'].str.split().apply(len).max())
print("Min sentence length:", df['sentence'].str.split().apply(len).min())
print("Unique words:", len(set(' '.join(df['sentence']).split())))
print("\nShape:", df.shape)
# Print DataFrame
print(df.head())
Numbers converted to words: 4: four 24: twenty four 1972: one thousand, nine hundred and seventy two 1: one 20: twenty 2: two 2012: two thousand and twelve 110: one hundred and ten 6: six 14: fourteen 31: thirty one 1964: one thousand, nine hundred and sixty four 1965: one thousand, nine hundred and sixty five 69: sixty nine 1: one 1966: one thousand, nine hundred and sixty six 67: sixty seven 1: one 10: ten 230: two hundred and thirty 1968: one thousand, nine hundred and sixty eight 1969: one thousand, nine hundred and sixty nine 80: eighty 17: seventeen 50: fifty 60: sixty 200: two hundred 180: one hundred and eighty 10: ten 20: twenty 195: one hundred and ninety five 249: two hundred and forty nine 300: three hundred 500: five hundred 350: three hundred and fifty 25: twenty five 20: twenty 65: sixty five 10: ten 2005: two thousand and five 000: zero 50: fifty 1: one 40: forty 60: sixty 23: twenty three 5: five 10: ten 10: ten 000: zero 180: one hundred and eighty 54: fifty four 1: one 300: three hundred 45: forty five 50: fifty 5: five 20: twenty 80: eighty 10: ten 100: one hundred 3: three 10: ten 21: twenty one 21: twenty one 400: four hundred 5: five 40: forty 35: thirty five 5: five 1: one 15: fifteen 250: two hundred and fifty 15: fifteen 25: twenty five 27: twenty seven 60: sixty 4: four 500: five hundred 6: six 6: six 20: twenty 68: sixty eight 7: seven 24: twenty four 2: two 3: three 2000: two thousand 40: forty 39: thirty nine 1940: one thousand, nine hundred and forty 75: seventy five 000: zero 100: one hundred 1: one 800: eight hundred 108: one hundred and eight 012: twelve 2012: two thousand and twelve 20: twenty 10: ten 85: eighty five 31: thirty one 300: three hundred 9: nine 3: three 150: one hundred and fifty 20: twenty 500: five hundred 000: zero 100: one hundred 1734: one thousand, seven hundred and thirty four 147: one hundred and forty seven 85: eighty five 55: fifty five 14: fourteen 15: fifteen 80: eighty 000: zero 90: ninety 200: two hundred 25: twenty five 300: three hundred 75: seventy five 10: ten 5: five 22: twenty two 150: one hundred and fifty 45: forty five 2001: two thousand and one 10: ten 6: six 300: three hundred 12: twelve 185: one hundred and eighty five 80: eighty 14: fourteen 30: thirty 40: forty 29: twenty nine 14: fourteen 300: three hundred 1: one 810: eight hundred and ten 18: eighteen 18: eighteen 40: forty 60: sixty 35: thirty five 000: zero 000: zero 9: nine 15: fifteen 17: seventeen 12: twelve 25: twenty five 30: thirty 21: twenty one 300: three hundred 10: ten 2009: two thousand and nine 45: forty five 30: thirty 59: fifty nine 7: seven 24: twenty four 5: five 000: zero 30: thirty 86: eighty six 32: thirty two 10: ten 5: five 70: seventy 100: one hundred 50: fifty 28: twenty eight 65: sixty five 45: forty five 30: thirty 40: forty 000: zero 1642: one thousand, six hundred and forty two 65: sixty five 12: twelve 1527: one thousand, five hundred and twenty seven 150: one hundred and fifty 250: two hundred and fifty 60: sixty 1900: one thousand, nine hundred 140: one hundred and forty 10: ten 12: twelve 13: thirteen 35: thirty five 12: twelve 40: forty 50: fifty 50: fifty 50: fifty 9: nine 5: five 1: one 125: one hundred and twenty five 5: five 28: twenty eight 24: twenty four 000: zero 18: eighteen 78: seventy eight 100: one hundred 10: ten 100: one hundred 18: eighteen 25: twenty five 1979: one thousand, nine hundred and seventy nine 33: thirty three 100: one hundred 100: one hundred 50: fifty 80: eighty 20: twenty 30: thirty 1: one 1709: one thousand, seven hundred and nine 1710: one thousand, seven hundred and ten 30: thirty 50: fifty 400: four hundred 218: two hundred and eighteen 97: ninety seven 24: twenty four 28: twenty eight 4: four 520: five hundred and twenty 32: thirty two 2013: two thousand and thirteen 2013: two thousand and thirteen 10: ten 160: one hundred and sixty 250: two hundred and fifty 2012: two thousand and twelve 78: seventy eight 2010: two thousand and ten 1980: one thousand, nine hundred and eighty 600: six hundred 500: five hundred 1: one 000: zero 30: thirty 400: four hundred 10: ten 200: two hundred 27: twenty seven 5: five 14: fourteen 90: ninety 25: twenty five 20: twenty 41: forty one 25: twenty five 100: one hundred 150: one hundred and fifty 20: twenty 30: thirty 25: twenty five 60: sixty 41: forty one 40: forty 47: forty seven 20: twenty 50: fifty 60: sixty 50: fifty 400: four hundred 1: one 20: twenty 140: one hundred and forty 1: one 80: eighty 20: twenty 95: ninety five 18: eighteen 500: five hundred 40: forty 50: fifty 500: five hundred 1: one 2015: two thousand and fifteen 100: one hundred 2: two 190: one hundred and ninety 400: four hundred 5: five 10: ten 50: fifty 5: five 36: thirty six 000: zero 67: sixty seven 100: one hundred 300: three hundred 125: one hundred and twenty five 30: thirty 11: eleven 16: sixteen 24: twenty four 2: two 4: four 3: three 30: thirty 2010: two thousand and ten 2010: two thousand and ten 2011: two thousand and eleven 1: one 20: twenty 40: forty 40: forty 8: eight 30: thirty 1947: one thousand, nine hundred and forty seven 60: sixty 5: five 60: sixty 12: twelve 12: twelve 1: one 12: twelve 1: one 12: twelve 2: two 2: two 12: twelve 12: twelve 12: twelve 2: two 100: one hundred 50: fifty 100: one hundred 1: one 12: twelve 000: zero 12: twelve 6: six 12: twelve 2000: two thousand 12: twelve 999: nine hundred and ninety nine 14: fourteen 12: twelve 000: zero 4: four 12: twelve 1: one 12: twelve 12: twelve 14: fourteen 12: twelve 000: zero 500: five hundred 500: five hundred 000: zero 10: ten 12: twelve 1: one 12: twelve 12: twelve 300: three hundred 500: five hundred 7: seven 000: zero 12: twelve 12: twelve 2: two 12: twelve 2011: two thousand and eleven 000: zero 14: fourteen 000: zero 63: sixty three 64: sixty four 180: one hundred and eighty 30: thirty 11: eleven 35: thirty five 8: eight 600: six hundred 95: ninety five 000: zero 450: four hundred and fifty 475: four hundred and seventy five 5: five 37: thirty seven 31: thirty one 35: thirty five 65: sixty five 000: zero 20: twenty 1835: one thousand, eight hundred and thirty five 54: fifty four 67: sixty seven 000: zero 000: zero 32: thirty two 000: zero 30: thirty 40: forty 58: fifty eight 000: zero 960: nine hundred and sixty 24: twenty four 27: twenty seven 000: zero 32: thirty two 89: eighty nine 70: seventy 48: forty eight 000: zero 62: sixty two 58: fifty eight 000: zero 12: twelve 69: sixty nine 160: one hundred and sixty 145: one hundred and forty five 500: five hundred 100: one hundred 93: ninety three 500: five hundred 30: thirty 50: fifty 1: one 13: thirteen 200: two hundred 000: zero 6: six 600: six hundred 63: sixty three 400: four hundred 60: sixty 1: one 1: one 000: zero 1: one 1: one 300: three hundred 25: twenty five 000: zero 24: twenty four 90: ninety 110: one hundred and ten 1935: one thousand, nine hundred and thirty five 30: thirty 50: fifty 800: eight hundred 100: one hundred 150: one hundred and fifty 250: two hundred and fifty 22: twenty two 200: two hundred 35: thirty five 2012: two thousand and twelve 1851: one thousand, eight hundred and fifty one 10: ten 24: twenty four 12: twelve 1943: one thousand, nine hundred and forty three 60: sixty 617: six hundred and seventeen 1533: one thousand, five hundred and thirty three 200: two hundred 1537: one thousand, five hundred and thirty seven 56: fifty six 26: twenty six 14: fourteen 2012: two thousand and twelve 20: twenty 30: thirty 21: twenty one 99: ninety nine 15: fifteen 12: twelve 900: nine hundred 1: one 1940: one thousand, nine hundred and forty 53: fifty three 400: four hundred 40: forty 1: one 2: two 52: fifty two 28: twenty eight 25: twenty five 48: forty eight 2008: two thousand and eight 60: sixty 75: seventy five 80: eighty 8: eight 5: five 100: one hundred 62: sixty two 12: twelve 11: eleven 371: three hundred and seventy one 371: three hundred and seventy one 000: zero 11: eleven 80: eighty 23: twenty three 40: forty 000: zero 6: six 2003: two thousand and three 93: ninety three 6: six 20: twenty 2003: two thousand and three 205: two hundred and five 69: sixty nine 33: thirty three 000: zero 59: fifty nine 56: fifty six 000: zero 40: forty 15: fifteen 70: seventy 50: fifty 60: sixty 29: twenty nine 45: forty five 80: eighty 100: one hundred 40: forty 8: eight 30: thirty 1: one 12: twelve 000: zero 55: fifty five 000: zero 10: ten 135: one hundred and thirty five 160: one hundred and sixty 110: one hundred and ten 90: ninety 15: fifteen 150: one hundred and fifty 16: sixteen 170: one hundred and seventy 20: twenty 54: fifty four 195: one hundred and ninety five 10: ten 450: four hundred and fifty 20: twenty 450: four hundred and fifty 400: four hundred 1: one 12: twelve 8: eight 200: two hundred 2: two 1: one 2010: two thousand and ten 25: twenty five 46: forty six 000: zero 62: sixty two 59: fifty nine 1995: one thousand, nine hundred and ninety five 000: zero 51: fifty one 56: fifty six 000: zero 000: zero 24: twenty four 66: sixty six 36: thirty six 57: fifty seven 000: zero 200: two hundred 1: one 76: seventy six 40: forty 20: twenty 16: sixteen 2012: two thousand and twelve 25: twenty five 50: fifty 14: fourteen 2013: two thousand and thirteen 61: sixty one 22: twenty two 11: eleven 66: sixty six 67: sixty seven 1846: one thousand, eight hundred and forty six 75: seventy five 120: one hundred and twenty 75: seventy five 5: five 5: five 11: eleven 3: three 50: fifty 2013: two thousand and thirteen 20: twenty 40: forty 3: three 35: thirty five 24: twenty four 12: twelve 10: ten 18: eighteen 18: eighteen 65: sixty five 100: one hundred 50: fifty 57: fifty seven 300: three hundred 50: fifty 8: eight 75: seventy five 30: thirty 50: fifty 28: twenty eight 10: ten 20: twenty 400: four hundred 500: five hundred 150: one hundred and fifty 7: seven 18: eighteen 26: twenty six 1984: one thousand, nine hundred and eighty four 462: four hundred and sixty two 2014: two thousand and fourteen 72: seventy two 1820: one thousand, eight hundred and twenty 2: two 1754: one thousand, seven hundred and fifty four 10: ten 10: ten 60: sixty 75: seventy five 94: ninety four 50: fifty 100: one hundred 370: three hundred and seventy 40: forty 50: fifty 51: fifty one 200: two hundred 1: one 70: seventy 100: one hundred 200: two hundred 1893: one thousand, eight hundred and ninety three 1991: one thousand, nine hundred and ninety one 100: one hundred 25: twenty five 100: one hundred 30: thirty 10: ten 1: one 200: two hundred 300: three hundred 4: four 120: one hundred and twenty 180: one hundred and eighty 230: two hundred and thirty 1946: one thousand, nine hundred and forty six 20: twenty 200: two hundred 400: four hundred 200: two hundred 500: five hundred 800: eight hundred 10: ten 20: twenty 200: two hundred 70: seventy 10: ten 30: thirty 10: ten 2: two 150: one hundred and fifty 31: thirty one 30: thirty 50: fifty 20: twenty 30: thirty 40: forty 10: ten 500: five hundred 12: twelve 30: thirty 25: twenty five 30: thirty 139: one hundred and thirty nine 25: twenty five 10: ten 20: twenty 638: six hundred and thirty eight 13: thirteen 10: ten 12: twelve 12: twelve 28: twenty eight 35: thirty five 60: sixty 100: one hundred 5: five 40: forty 12: twelve 25: twenty five 25: twenty five 9: nine 5: five 15: fifteen 25: twenty five 000: zero 150: one hundred and fifty 100: one hundred 150: one hundred and fifty 100: one hundred 150: one hundred and fifty 30: thirty 45: forty five 930: nine hundred and thirty 9: nine 4: four 1: one 48: forty eight 400: four hundred 2: two 707: seven hundred and seven 707: seven hundred and seven 50: fifty 2: two 50: fifty 2: two 350: three hundred and fifty 14: fourteen 24: twenty four 25: twenty five 30: thirty 12: twelve 50: fifty 2: two 975: nine hundred and seventy five 300: three hundred 200: two hundred 2: two 600: six hundred 000: zero 5: five 12: twelve 45: forty five 120: one hundred and twenty 800: eight hundred 1: one 38: thirty eight 1: one 2012: two thousand and twelve 1: one 30: thirty 50: fifty 60: sixty 5: five 300: three hundred 200: two hundred 1947: one thousand, nine hundred and forty seven 52: fifty two 90: ninety 2000: two thousand 40: forty 100: one hundred 000: zero 95: ninety five 40: forty 14: fourteen 2: two 79: seventy nine 45: forty five 100: one hundred 500: five hundred 600: six hundred 100: one hundred 2: two 194: one hundred and ninety four 15: fifteen 1779: one thousand, seven hundred and seventy nine 60: sixty 25: twenty five 200: two hundred 250: two hundred and fifty 30: thirty 600: six hundred 100: one hundred 30: thirty 50: fifty 80: eighty 65: sixty five 25: twenty five 20: twenty 180: one hundred and eighty 250: two hundred and fifty 50: fifty 5: five 000: zero 10: ten 200: two hundred 1800: one thousand, eight hundred 1910: one thousand, nine hundred and ten 10: ten 12: twelve 75: seventy five 100: one hundred 15: fifteen 70: seventy 60: sixty 99: ninety nine 45: forty five 5: five 000: zero 40: forty 60: sixty 12: twelve 15: fifteen 400: four hundred 600: six hundred 1: one 853: eight hundred and fifty three 46: forty six 500: five hundred 25: twenty five 1888: one thousand, eight hundred and eighty eight 250: two hundred and fifty 10: ten 5: five 1958: one thousand, nine hundred and fifty eight 85: eighty five 600: six hundred 800: eight hundred 2006: two thousand and six 15: fifteen 200: two hundred 300: three hundred 38: thirty eight 28: twenty eight 100: one hundred 60: sixty 300: three hundred 300: three hundred 75: seventy five 1899: one thousand, eight hundred and ninety nine 300: three hundred 2: two 200: two hundred 21: twenty one 100: one hundred 40: forty 125: one hundred and twenty five 24: twenty four 50: fifty 23: twenty three 13: thirteen 10: ten 120: one hundred and twenty 50: fifty 38: thirty eight 60: sixty 30: thirty 150: one hundred and fifty 12: twelve 2: two 32: thirty two 2012: two thousand and twelve 800: eight hundred 1907: one thousand, nine hundred and seven 50: fifty 50: fifty 35: thirty five 25: twenty five 1: one 90: ninety 450: four hundred and fifty 900: nine hundred 400: four hundred 500: five hundred 750: seven hundred and fifty 58: fifty eight 370: three hundred and seventy 42: forty two 10: ten 150: one hundred and fifty 50: fifty 1901: one thousand, nine hundred and one 5: five 10: ten 30: thirty 20: twenty 4: four 1985: one thousand, nine hundred and eighty five 175: one hundred and seventy five 12: twelve 11: eleven 60: sixty 12: twelve 100: one hundred 100: one hundred 500: five hundred 11: eleven 100: one hundred 300: three hundred 50: fifty 48: forty eight 30: thirty 1924: one thousand, nine hundred and twenty four 160: one hundred and sixty 80: eighty 24: twenty four 15: fifteen 600: six hundred 2: two 1: one 20: twenty 40: forty 50: fifty 15: fifteen 10: ten 10: ten 25: twenty five 70: seventy 20: twenty 100: one hundred 200: two hundred 15: fifteen 34: thirty four 20: twenty 20: twenty 26: twenty six 3: three 32: thirty two 20: twenty 5: five 60: sixty 800: eight hundred 100: one hundred 20: twenty 65: sixty five 50: fifty 20: twenty 10: ten 150: one hundred and fifty 2: two 63: sixty three 33: thirty three 100: one hundred 000: zero 000: zero 500: five hundred 90: ninety 50: fifty 20: twenty 1900: one thousand, nine hundred 10: ten 200: two hundred 17: seventeen 30: thirty 24: twenty four 120: one hundred and twenty 100: one hundred 100: one hundred 1: one 120: one hundred and twenty 27: twenty seven 1934: one thousand, nine hundred and thirty four 673: six hundred and seventy three 29: twenty nine 30: thirty 6: six 600: six hundred 200: two hundred 62: sixty two 100: one hundred 3: three 180: one hundred and eighty 142: one hundred and forty two 100: one hundred 1958: one thousand, nine hundred and fifty eight 25: twenty five 16: sixteen 300: three hundred 400: four hundred 000: zero 30: thirty 10: ten 12: twelve 24: twenty four 300: three hundred 40: forty 80: eighty 400: four hundred 8: eight 200: two hundred 300: three hundred 800: eight hundred 12: twelve 000: zero 20: twenty 7: seven 0: zero 40: forty 40: forty 75: seventy five 20: twenty 4: four 4: four 29: twenty nine 1770: one thousand, seven hundred and seventy 000: zero 627: six hundred and twenty seven 465: four hundred and sixty five 375: three hundred and seventy five 385: three hundred and eighty five 3: three 2011: two thousand and eleven 1942: one thousand, nine hundred and forty two 60: sixty 7: seven 18: eighteen 000: zero 2001: two thousand and one 11: eleven 8: eight 5: five 180: one hundred and eighty 30: thirty 75: seventy five 1993: one thousand, nine hundred and ninety three 1978: one thousand, nine hundred and seventy eight 18: eighteen 20: twenty 3: three 1: one 24: twenty four 20: twenty 24: twenty four 150: one hundred and fifty 15: fifteen 1850: one thousand, eight hundred and fifty 1035: one thousand and thirty five 100: one hundred 000: zero 20: twenty 1887: one thousand, eight hundred and eighty seven 26: twenty six 15: fifteen 155: one hundred and fifty five 30: thirty 240: two hundred and forty 15: fifteen 115: one hundred and fifteen 10: ten 1: one 10: ten 94: ninety four 24: twenty four 2: two 2: two 300: three hundred 450: four hundred and fifty 65: sixty five 100: one hundred 100: one hundred 195: one hundred and ninety five 300: three hundred 165: one hundred and sixty five 37: thirty seven 1: one 28: twenty eight 1814: one thousand, eight hundred and fourteen 200: two hundred 100: one hundred 75: seventy five 100: one hundred 450: four hundred and fifty 32: thirty two 20: twenty 246: two hundred and forty six 270: two hundred and seventy 400: four hundred 125: one hundred and twenty five 380: three hundred and eighty 125: one hundred and twenty five 200: two hundred 250: two hundred and fifty 21: twenty one 1: one 450: four hundred and fifty 40: forty 15: fifteen 170: one hundred and seventy 15: fifteen 700: seven hundred 9: nine 200: two hundred 400: four hundred 75: seventy five 600: six hundred 300: three hundred 170: one hundred and seventy 10: ten 2014: two thousand and fourteen 000: zero 3: three 000: zero 30: thirty 1948: one thousand, nine hundred and forty eight 15: fifteen 50: fifty 20: twenty 18: eighteen 18: eighteen 32: thirty two 100: one hundred 1: one 500: five hundred 1338: one thousand, three hundred and thirty eight 12: twelve 24: twenty four 000: zero 10: ten 1545: one thousand, five hundred and forty five 25: twenty five 2008: two thousand and eight 500: five hundred 112: one hundred and twelve 16: sixteen 35: thirty five 500: five hundred 10: ten 4: four 700: seven hundred 17: seventeen 17: seventeen 2: two 8: eight 17: seventeen 1: one 100: one hundred 2014: two thousand and fourteen 515: five hundred and fifteen 210: two hundred and ten 125: one hundred and twenty five 550: five hundred and fifty 625: six hundred and twenty five 430: four hundred and thirty 150: one hundred and fifty 15: fifteen 75: seventy five 190: one hundred and ninety 133: one hundred and thirty three 295: two hundred and ninety five 37: thirty seven 000: zero 65: sixty five 425: four hundred and twenty five 147: one hundred and forty seven 8: eight 10: ten 62: sixty two 110: one hundred and ten 15: fifteen 20: twenty 20: twenty 21: twenty one 14: fourteen 3: three 25: twenty five 30: thirty 100: one hundred 110: one hundred and ten 800: eight hundred 200: two hundred 000: zero 20: twenty 1: one 400: four hundred 12: twelve 24: twenty four 6000: six thousand 135: one hundred and thirty five 20: twenty 30: thirty 50: fifty 15: fifteen 40: forty 000: zero 1905: one thousand, nine hundred and five 100: one hundred 3: three 100: one hundred 100: one hundred 50: fifty 2014: two thousand and fourteen 1984: one thousand, nine hundred and eighty four 2014: two thousand and fourteen 14: fourteen 87: eighty seven 3: three 20: twenty 65: sixty five 170: one hundred and seventy 11: eleven 2001: two thousand and one 96: ninety six 20: twenty 1945: one thousand, nine hundred and forty five 1965: one thousand, nine hundred and sixty five 1739: one thousand, seven hundred and thirty nine 28: twenty eight 161: one hundred and sixty one 35: thirty five 000: zero 14: fourteen 70: seventy 10: ten 20: twenty 40: forty 1606: one thousand, six hundred and six 10: ten 20: twenty 25: twenty five 300: three hundred 500: five hundred 200: two hundred 1996: one thousand, nine hundred and ninety six 95: ninety five 18: eighteen 59: fifty nine 23: twenty three 18: eighteen 12: twelve 21: twenty one 1976: one thousand, nine hundred and seventy six 25: twenty five 9: nine 458: four hundred and fifty eight 40: forty 11: eleven 15: fifteen 11: eleven 75: seventy five 15: fifteen 15: fifteen 20: twenty 1: one 10: ten 20: twenty 40: forty 85: eighty five 35: thirty five 5: five 1949: one thousand, nine hundred and forty nine 20: twenty 40: forty 11: eleven 70: seventy 30: thirty 000: zero 50: fifty 000: zero 65: sixty five 2016: two thousand and sixteen 28: twenty eight 100: one hundred 150: one hundred and fifty 2050: two thousand and fifty 100: one hundred 1963: one thousand, nine hundred and sixty three 20: twenty 80: eighty 100: one hundred 20: twenty 17: seventeen 000: zero 300: three hundred 46: forty six 700: seven hundred 1527: one thousand, five hundred and twenty seven 82: eighty two 60: sixty 18: eighteen 20: twenty 55: fifty five 1958: one thousand, nine hundred and fifty eight 16: sixteen 13: thirteen 12: twelve 000: zero 100: one hundred 14: fourteen 30: thirty 155: one hundred and fifty five 50: fifty 22: twenty two 000: zero 000: zero 40: forty 100: one hundred 150: one hundred and fifty 52: fifty two 11: eleven 24: twenty four 300: three hundred 400: four hundred 30: thirty 40: forty 30: thirty 27: twenty seven 46: forty six 300: three hundred 10: ten 4: four 7: seven 40: forty 5: five 5: five 80: eighty 80: eighty 150: one hundred and fifty 76: seventy six 20: twenty 2013: two thousand and thirteen 40: forty 20: twenty 930: nine hundred and thirty 1800: one thousand, eight hundred 680: six hundred and eighty 500: five hundred 80: eighty 90: ninety 95: ninety five 50: fifty 11: eleven 24: twenty four 300: three hundred 3: three 000: zero 89: eighty nine 20: twenty 40: forty 5: five 191: one hundred and ninety one 17: seventeen 1963: one thousand, nine hundred and sixty three 40: forty 60: sixty 40: forty 50: fifty 4: four 96: ninety six 0: zero 150: one hundred and fifty 180: one hundred and eighty 5: five 200: two hundred 300: three hundred 26: twenty six 11: eleven 15: fifteen 20: twenty 200: two hundred 120: one hundred and twenty 1: one 20: twenty 90: ninety 100: one hundred 200: two hundred 300: three hundred 100: one hundred 10: ten 20: twenty 82: eighty two 20: twenty 200: two hundred 100: one hundred 200: two hundred 100: one hundred 30: thirty 60: sixty 200: two hundred 200: two hundred 3: three 97: ninety seven 5: five 1: one 5: five 85: eighty five 40: forty 15: fifteen 20: twenty 2: two 4: four 2: two 500: five hundred 20: twenty 20: twenty 25: twenty five 30: thirty 27: twenty seven 000: zero 50: fifty 12: twelve 1: one 12: twelve 1: one 12: twelve 30: thirty 1: one 2: two 000: zero 520: five hundred and twenty 30: thirty 18: eighteen 18: eighteen 16: sixteen 18: eighteen 15: fifteen 20: twenty 7: seven 460: four hundred and sixty 1: one 30: thirty 3: three 2: two 12: twelve 2022: two thousand and twenty two 63: sixty three 3: three 15: fifteen 10: ten 2010: two thousand and ten 1923: one thousand, nine hundred and twenty three 11: eleven 25: twenty five 21: twenty one 100: one hundred 140: one hundred and forty 150: one hundred and fifty 200: two hundred 90: ninety 75: seventy five 20: twenty 16: sixteen 1723: one thousand, seven hundred and twenty three 150: one hundred and fifty 100: one hundred 200: two hundred 50: fifty 100: one hundred 90: ninety 200: two hundred 300: three hundred 30: thirty 24: twenty four 5: five 000: zero 000: zero 1: one 70: seventy 2: two 84: eighty four 1988: one thousand, nine hundred and eighty eight 15: fifteen 150: one hundred and fifty 29: twenty nine 38: thirty eight 1991: one thousand, nine hundred and ninety one 40: forty 55: fifty five 15: fifteen 20: twenty 47: forty seven 28: twenty eight 30: thirty 25: twenty five 98: ninety eight 230: two hundred and thirty 11: eleven 30: thirty 28: twenty eight 1605: one thousand, six hundred and five 40: forty 16: sixteen 80: eighty 120: one hundred and twenty 8: eight 60: sixty 20: twenty 30: thirty 1989: one thousand, nine hundred and eighty nine 20: twenty 18: eighteen 1987: one thousand, nine hundred and eighty seven 1926: one thousand, nine hundred and twenty six 13: thirteen 14: fourteen 250: two hundred and fifty 150: one hundred and fifty 400: four hundred 500: five hundred 120: one hundred and twenty 40: forty 40: forty 7: seven 9: nine 12: twelve 30: thirty 90: ninety 15: fifteen 150: one hundred and fifty 100: one hundred 180: one hundred and eighty 000: zero 175: one hundred and seventy five 150: one hundred and fifty 1828: one thousand, eight hundred and twenty eight 70: seventy 56: fifty six 10: ten 15: fifteen 20: twenty 130: one hundred and thirty 15: fifteen 20: twenty 12: twelve 330: three hundred and thirty 400: four hundred 100: one hundred 100: one hundred 100: one hundred 1911: one thousand, nine hundred and eleven 10: ten 340: three hundred and forty 75: seventy five 60: sixty 1828: one thousand, eight hundred and twenty eight 70: seventy 14: fourteen 15: fifteen 1651: one thousand, six hundred and fifty one 95: ninety five 20: twenty 50: fifty 60: sixty 520: five hundred and twenty 750: seven hundred and fifty 1948: one thousand, nine hundred and forty eight 800: eight hundred 20: twenty 25: twenty five 25: twenty five 60: sixty 500: five hundred 000: zero 58: fifty eight 20: twenty 750: seven hundred and fifty 90: ninety 10: ten 15: fifteen 1: one 12: twelve 000: zero 000: zero 20: twenty 24: twenty four 34: thirty four 729: seven hundred and twenty nine 20: twenty 10: ten 100: one hundred 130: one hundred and thirty 26: twenty six 000: zero 800: eight hundred 37: thirty seven 34: thirty four 90: ninety 15: fifteen 465: four hundred and sixty five 25: twenty five 000: zero 800: eight hundred 15: fifteen 3: three 50: fifty 2: two 18: eighteen 400: four hundred 415: four hundred and fifteen 30: thirty 19: nineteen 23: twenty three 400: four hundred 1706: one thousand, seven hundred and six 16: sixteen 20: twenty 200: two hundred 21: twenty one 500: five hundred 125: one hundred and twenty five 45: forty five 28: twenty eight 29: twenty nine 54: fifty four 16: sixteen 6: six 60: sixty 20: twenty 67: sixty seven 22: twenty two 60: sixty 000: zero 15: fifteen 200: two hundred 5: five 600: six hundred 15: fifteen 1933: one thousand, nine hundred and thirty three 10: ten 40: forty 90: ninety 000: zero 100: one hundred 000: zero 55: fifty five 400: four hundred 30: thirty 000: zero 246: two hundred and forty six 68: sixty eight 30: thirty 135: one hundred and thirty five 160: one hundred and sixty 175: one hundred and seventy five 000: zero 000: zero 2006: two thousand and six 57: fifty seven 185: one hundred and eighty five 125: one hundred and twenty five 205: two hundred and five 950: nine hundred and fifty 60: sixty 125: one hundred and twenty five 450: four hundred and fifty 385: three hundred and eighty five 5: five 7: seven 3: three 31: thirty one 80: eighty 69: sixty nine 8: eight 000: zero 350: three hundred and fifty 400: four hundred 550: five hundred and fifty 600: six hundred 650: six hundred and fifty 000: zero 000: zero 165: one hundred and sixty five 15: fifteen 90: ninety 20: twenty 10: ten 85: eighty five 100: one hundred 2: two 120: one hundred and twenty 13: thirteen 260: two hundred and sixty 27: twenty seven 61: sixty one 300: three hundred 165: one hundred and sixty five 175: one hundred and seventy five 50: fifty 225: two hundred and twenty five 35: thirty five 35: thirty five 42: forty two 500: five hundred 70: seventy 700: seven hundred 245: two hundred and forty five 90: ninety 120: one hundred and twenty 200: two hundred 65: sixty five 50: fifty 2: two 60: sixty 8: eight 425: four hundred and twenty five 225: two hundred and twenty five 1: one 200: two hundred 99: ninety nine 50: fifty 000: zero 400: four hundred 500: five hundred 100: one hundred 300: three hundred 500: five hundred 99: ninety nine 3: three 1: one 35: thirty five 270: two hundred and seventy 32: thirty two 32: thirty two 30: thirty 20: twenty 130: one hundred and thirty 300: three hundred 23: twenty three 12: twelve 22: twenty two 20: twenty 22: twenty two 29: twenty nine 000: zero 60: sixty 10: ten 15: fifteen 239: two hundred and thirty nine 14: fourteen 250: two hundred and fifty 16: sixteen 350: three hundred and fifty 370: three hundred and seventy 350: three hundred and fifty 000: zero 99: ninety nine 53: fifty three 5: five 75: seventy five 40: forty 40: forty 30: thirty 2017: two thousand and seventeen 2: two 16: sixteen 18: eighteen 5: five 10: ten 20: twenty 6: six 15: fifteen 20: twenty 000: zero 1918: one thousand, nine hundred and eighteen 1923: one thousand, nine hundred and twenty three 53: fifty three 55: fifty five 130: one hundred and thirty 12: twelve 69: sixty nine 160: one hundred and sixty 36: thirty six 200: two hundred 100: one hundred 150: one hundred and fifty 95: ninety five 9: nine 155: one hundred and fifty five 10: ten 25: twenty five 30: thirty 235: two hundred and thirty five 125: one hundred and twenty five 110: one hundred and ten 1981: one thousand, nine hundred and eighty one 12: twelve 50: fifty 10: ten 1: one 1967: one thousand, nine hundred and sixty seven 90: ninety 400: four hundred 40: forty 4: four 000: zero 50: fifty 30: thirty 100: one hundred 25: twenty five 8: eight 1745: one thousand, seven hundred and forty five 000: zero 8: eight 5: five 2016: two thousand and sixteen 10: ten 1: one 15: fifteen 80: eighty 35: thirty five 20: twenty 100: one hundred 50: fifty 40: forty 10: ten 10: ten 620: six hundred and twenty 12: twelve 350: three hundred and fifty 500: five hundred 14: fourteen 10: ten 11: eleven 11: eleven 10: ten 647: six hundred and forty seven 101: one hundred and one 30: thirty 200: two hundred 200: two hundred 300: three hundred 140: one hundred and forty 1918: one thousand, nine hundred and eighteen 1961: one thousand, nine hundred and sixty one 1940: one thousand, nine hundred and forty 18: eighteen 250: two hundred and fifty 165: one hundred and sixty five 85: eighty five 42: forty two 820: eight hundred and twenty 44: forty four 250: two hundred and fifty 1759: one thousand, seven hundred and fifty nine 100: one hundred 15: fifteen 60: sixty 20: twenty 30: thirty 23: twenty three 13: thirteen 7: seven 000: zero 000: zero 9000: nine thousand 966: nine hundred and sixty six 100: one hundred 55: fifty five 15: fifteen 500: five hundred 14: fourteen 65: sixty five 96: ninety six 5: five 20: twenty 35: thirty five 200: two hundred 30: thirty 1940: one thousand, nine hundred and forty 15: fifteen 18: eighteen 5: five 30: thirty 20: twenty 100: one hundred 14: fourteen 95: ninety five 4: four 2: two 1960: one thousand, nine hundred and sixty 1963: one thousand, nine hundred and sixty three 57: fifty seven 900: nine hundred 6: six 90: ninety 40: forty 000: zero 2: two 30: thirty 000: zero 2009: two thousand and nine 10: ten 10: ten 40: forty 60: sixty 25: twenty five 35: thirty five 78: seventy eight 1826: one thousand, eight hundred and twenty six 12: twelve 6: six 25: twenty five 27: twenty seven 1: one 300: three hundred 400: four hundred 100: one hundred 60: sixty 425: four hundred and twenty five 000: zero 10: ten 13: thirteen 425: four hundred and twenty five 6: six 100: one hundred 70: seventy 6: six 100: one hundred 17: seventeen 100: one hundred 120: one hundred and twenty 50: fifty 5: five 12: twelve 75: seventy five 10: ten 209: two hundred and nine 50: fifty 260: two hundred and sixty 260: two hundred and sixty 160: one hundred and sixty 14: fourteen 50: fifty 26: twenty six 18: eighteen 8: eight 70: seventy 20: twenty 500: five hundred 1: one 3: three 12: twelve 1878: one thousand, eight hundred and seventy eight 30: thirty 40: forty 6: six 2015: two thousand and fifteen 7: seven 12: twelve 1977: one thousand, nine hundred and seventy seven 30: thirty 1954: one thousand, nine hundred and fifty four 20: twenty 12: twelve 2015: two thousand and fifteen 2: two 54: fifty four 10: ten 24: twenty four 300: three hundred 218: two hundred and eighteen 35: thirty five 1951: one thousand, nine hundred and fifty one 20: twenty 90: ninety 90: ninety 15: fifteen 854: eight hundred and fifty four 1985: one thousand, nine hundred and eighty five 20: twenty 000: zero 30: thirty 3: three 3: three 5: five 49: forty nine 350: three hundred and fifty 100: one hundred 200: two hundred 105: one hundred and five 12: twelve 1: one 5: five 400: four hundred 5: five 1: one 2: two 000: zero 2011: two thousand and eleven 1911: one thousand, nine hundred and eleven 1967: one thousand, nine hundred and sixty seven 15: fifteen 11: eleven 2016: two thousand and sixteen 646: six hundred and forty six 2: two 2: two 1973: one thousand, nine hundred and seventy three 65: sixty five 100: one hundred 150: one hundred and fifty 600: six hundred 400: four hundred 500: five hundred 1994: one thousand, nine hundred and ninety four 17: seventeen 30: thirty 15: fifteen 200: two hundred 300: three hundred 15: fifteen 2016: two thousand and sixteen 50: fifty 50: fifty 100: one hundred 520: five hundred and twenty 150: one hundred and fifty 2300: two thousand, three hundred 24: twenty four 15: fifteen 40: forty 10: ten 200: two hundred 1: one 700: seven hundred 200: two hundred 18: eighteen 15: fifteen 20: twenty 14: fourteen 69: sixty nine 3: three 200: two hundred 25: twenty five 930: nine hundred and thirty 10: ten 2016: two thousand and sixteen 2016: two thousand and sixteen 2016: two thousand and sixteen 180: one hundred and eighty 13: thirteen 7: seven 1852: one thousand, eight hundred and fifty two 1: one 35: thirty five 150: one hundred and fifty 42: forty two 2: two 100: one hundred 70: seventy 100: one hundred 20: twenty 30: thirty 4: four 60: sixty 100: one hundred 100: one hundred 700: seven hundred 20: twenty 000: zero 70: seventy 1: one 20: twenty 65: sixty five 70: seventy 20: twenty 20: twenty 100: one hundred 000: zero 30: thirty 110: one hundred and ten 22: twenty two 24: twenty four 120: one hundred and twenty 10: ten 12: twelve 14: fourteen 5: five 31: thirty one 12: twelve 7: seven 9: nine 2: two 74: seventy four 1: one 2009: two thousand and nine 40: forty 1969: one thousand, nine hundred and sixty nine 1998: one thousand, nine hundred and ninety eight 14: fourteen 1973: one thousand, nine hundred and seventy three 000: zero 100: one hundred 25: twenty five 20: twenty 48: forty eight 17: seventeen 24: twenty four 4: four 000: zero 2: two 55: fifty five 4: four 24: twenty four 28: twenty eight 27: twenty seven 27: twenty seven 30: thirty 30: thirty 12: twelve 600: six hundred 500: five hundred 1835: one thousand, eight hundred and thirty five 22: twenty two 20: twenty 45: forty five 60: sixty 89: eighty nine 40: forty 0: zero 12: twelve 10: ten 1826: one thousand, eight hundred and twenty six 000: zero 5: five 1848: one thousand, eight hundred and forty eight 14: fourteen 16: sixteen 11: eleven 800: eight hundred 300: three hundred 000: zero 100: one hundred 600: six hundred 125: one hundred and twenty five 20: twenty 85: eighty five 40: forty 8: eight 18: eighteen 73: seventy three 40: forty 2: two 1948: one thousand, nine hundred and forty eight 80: eighty 500: five hundred 200: two hundred 400: four hundred 1948: one thousand, nine hundred and forty eight 20: twenty 1790: one thousand, seven hundred and ninety 1984: one thousand, nine hundred and eighty four 2: two 000: zero 400: four hundred 150: one hundred and fifty 15: fifteen 25: twenty five 000: zero 1: one 000: zero 47: forty seven 22: twenty two 5: five 40: forty 100: one hundred 40: forty 10: ten 2: two 1: one 49: forty nine 100: one hundred 350: three hundred and fifty 155: one hundred and fifty five 90: ninety 95: ninety five 000: zero 325: three hundred and twenty five 320: three hundred and twenty 325: three hundred and twenty five 10: ten 20: twenty 15: fifteen 30: thirty 80: eighty 60: sixty 15: fifteen 1: one 10: ten 20: twenty 2: two 1848: one thousand, eight hundred and forty eight 1842: one thousand, eight hundred and forty two 90: ninety 25: twenty five 30: thirty 10: ten Some additional insights: Average sentence length: 7.231702262265756 Max sentence length: 28 Min sentence length: 3 Unique words: 17388 Shape: (45839, 1) sentence 0 WHEN YOU'RE COOKING CHIPS AT HOME 1 THE TRADITIONAL CHIP PAN OFTEN STAYS ON THE SHELF 2 THROUGH WHAT THEY CALL A KNIFE BLOCK 3 WHICH INVOLVES FIRING A POTATO DOWN A PIPE 4 APART FROM THE GOLDEN COLOUR AND THE DELICIOUS...
In [ ]:
# Define a color palette
palette = {
"histogram": "#2980B9",
"bar1": "#3498DB",
"bar2": "#E74C3C",
"bar3": "#1ABC9C"
}
# 1. Distribution of Sentence Lengths
plt.figure(figsize=(12, 6))
sentence_lengths = df['sentence'].str.split().apply(len)
sns.histplot(sentence_lengths, bins=30, color=palette["histogram"], edgecolor='black', alpha=0.7)
plt.title('Distribution of Sentence Lengths', fontsize=15)
plt.xlabel('Sentence Length (words)', fontsize=12)
plt.ylabel('Frequency', fontsize=12)
plt.grid(axis='y', linestyle='--', alpha=0.5)
plt.tight_layout()
plt.show()
# 2. Proportion of Sentences with Numbers Converted vs. Total Sentences
plt.figure(figsize=(10, 6))
labels = ['Sentences with Numbers Converted', 'Other Sentences']
values = [len(numbers_converted), len(df) - len(numbers_converted)]
bars = plt.bar(labels, values, color=[palette["bar1"], palette["bar2"]])
for bar in bars:
yval = bar.get_height()
plt.text(bar.get_x() + bar.get_width()/2, yval + 100,
f'{yval} ({yval/len(df)*100:.1f}%)',
ha='center', va='bottom', fontweight='bold')
plt.title('Proportion of Sentences with Numbers Converted')
plt.ylabel('Count')
plt.tight_layout()
plt.show()
# 3. Top 10 Most Frequently Converted Numbers
num_freq = Counter([num for num, word in numbers_converted])
common_nums = num_freq.most_common(10)
nums, counts = zip(*common_nums)
plt.figure(figsize=(12, 7))
bars = plt.barh(nums, counts, color=palette["bar3"])
plt.gca().invert_yaxis() # To display the most frequent number at the top
for bar in bars:
plt.text(bar.get_width() - (0.02 * max(counts)), bar.get_y() + bar.get_height()/2,
str(int(bar.get_width())), va='center', ha='right', color='white', fontweight='bold')
plt.title('Top 10 Most Frequently Converted Numbers', fontsize=15)
plt.xlabel('Frequency', fontsize=12)
plt.ylabel('Number', fontsize=12)
plt.tight_layout()
plt.show()
In [ ]:
sentences = df['sentence'].tolist()
# Calculate Unique Word Count
unique_words = set(word for sentence in sentences for word in sentence.split())
print(f"Number of unique words: {len(unique_words)}")
# Initial Letters Distribution
initial_letters = [word[0].lower() for sentence in sentences for word in sentence.split()]
initial_letter_freq = Counter(initial_letters)
# Vowel and Consonant Distribution
vowels = set("aeiou")
num_vowels = sum(1 for word in ''.join(sentences).lower() if word in vowels)
num_consonants = sum(1 for word in ''.join(sentences).lower() if word.isalpha() and word not in vowels)
print(f"\nThe number of vowels are: {num_vowels}")
print(f"The number of consonants are: {num_consonants}")
Number of unique words: 17388 The number of vowels are: 534404 The number of consonants are: 858424
In [ ]:
# Calculate sentence lengths
sentence_lengths = [len(nltk.word_tokenize(line)) for line in df['sentence']]
# Statistics
average_length = np.mean(sentence_lengths)
shortest_length = np.min(sentence_lengths)
longest_length = np.max(sentence_lengths)
# Print statistics
print("Average sentence length:", average_length)
print("Median sentence length:", np.median(sentence_lengths))
print("Standard deviation of sentence length:", np.std(sentence_lengths, ddof=1))
print("Minimum sentence length:", shortest_length)
print("Maximum sentence length:", longest_length)
# Histogram for Sentence Lengths Distribution
plt.figure(figsize=(10, 6))
plt.hist(sentence_lengths, bins=30, edgecolor='k', alpha=0.7, color="#3498DB")
plt.title('Sentence Lengths Distribution')
plt.xlabel('Sentence Length')
plt.ylabel('Number of Sentences')
plt.show()
# Bar plot for Average, Shortest, and Longest sentence lengths
plt.figure(figsize=(10, 6))
sentence_labels = ['Average', 'Shortest', 'Longest']
lengths = [average_length, shortest_length, longest_length]
sns.barplot(x=sentence_labels, y=lengths, palette="Blues_d")
plt.title('Sentence Lengths Overview')
plt.ylabel('Number of Words')
plt.show()
# Unique Word Count Visualization
plt.figure(figsize=(5, 6))
sns.barplot(x=['Unique Words'], y=[len(unique_words)], palette="Purples_d")
plt.title('Unique Word Count')
plt.show()
# Vowel vs. Consonant Distribution Visualization
plt.figure(figsize=(8, 8))
labels = ['Vowels', 'Consonants']
sizes = [num_vowels, num_consonants]
colors = ['#ff9999','#66b2b2']
plt.pie(sizes, labels=labels, colors=colors, autopct='%1.1f%%', startangle=90)
plt.title('Vowel vs. Consonant Distribution')
plt.axis('equal') # Equal aspect ratio ensures pie is drawn as a circle.
plt.show()
# Initial Letter Distribution Visualization
plt.figure(figsize=(14, 8))
letters, counts = zip(*initial_letter_freq.most_common())
sns.barplot(x=list(letters), y=list(counts), palette="viridis")
plt.title('Initial Letter Distribution')
plt.xlabel('Initial Letter')
plt.ylabel('Count')
plt.show()
# Number-to-Word Conversions Table
print("\nNumbers converted to words:")
for number, words in numbers_converted:
print(f"{number}: {words.replace('-', ' ')}")
# Insights from DataFrame Visualization
plt.figure(figsize=(10, 6))
df_lengths = [df['sentence'].str.split().apply(len).mean(), df['sentence'].str.split().apply(len).min(), df['sentence'].str.split().apply(len).max()]
df_labels = ['Average', 'Shortest', 'Longest']
sns.barplot(x=df_labels, y=df_lengths, palette="Greens_d")
plt.title('Sentence Lengths from DataFrame')
plt.ylabel('Number of Words')
plt.show()
Average sentence length: 7.540958572394686 Median sentence length: 6.0 Standard deviation of sentence length: 3.871960425939225 Minimum sentence length: 3 Maximum sentence length: 30
Numbers converted to words: 4: four 24: twenty four 1972: one thousand, nine hundred and seventy two 1: one 20: twenty 2: two 2012: two thousand and twelve 110: one hundred and ten 6: six 14: fourteen 31: thirty one 1964: one thousand, nine hundred and sixty four 1965: one thousand, nine hundred and sixty five 69: sixty nine 1: one 1966: one thousand, nine hundred and sixty six 67: sixty seven 1: one 10: ten 230: two hundred and thirty 1968: one thousand, nine hundred and sixty eight 1969: one thousand, nine hundred and sixty nine 80: eighty 17: seventeen 50: fifty 60: sixty 200: two hundred 180: one hundred and eighty 10: ten 20: twenty 195: one hundred and ninety five 249: two hundred and forty nine 300: three hundred 500: five hundred 350: three hundred and fifty 25: twenty five 20: twenty 65: sixty five 10: ten 2005: two thousand and five 000: zero 50: fifty 1: one 40: forty 60: sixty 23: twenty three 5: five 10: ten 10: ten 000: zero 180: one hundred and eighty 54: fifty four 1: one 300: three hundred 45: forty five 50: fifty 5: five 20: twenty 80: eighty 10: ten 100: one hundred 3: three 10: ten 21: twenty one 21: twenty one 400: four hundred 5: five 40: forty 35: thirty five 5: five 1: one 15: fifteen 250: two hundred and fifty 15: fifteen 25: twenty five 27: twenty seven 60: sixty 4: four 500: five hundred 6: six 6: six 20: twenty 68: sixty eight 7: seven 24: twenty four 2: two 3: three 2000: two thousand 40: forty 39: thirty nine 1940: one thousand, nine hundred and forty 75: seventy five 000: zero 100: one hundred 1: one 800: eight hundred 108: one hundred and eight 012: twelve 2012: two thousand and twelve 20: twenty 10: ten 85: eighty five 31: thirty one 300: three hundred 9: nine 3: three 150: one hundred and fifty 20: twenty 500: five hundred 000: zero 100: one hundred 1734: one thousand, seven hundred and thirty four 147: one hundred and forty seven 85: eighty five 55: fifty five 14: fourteen 15: fifteen 80: eighty 000: zero 90: ninety 200: two hundred 25: twenty five 300: three hundred 75: seventy five 10: ten 5: five 22: twenty two 150: one hundred and fifty 45: forty five 2001: two thousand and one 10: ten 6: six 300: three hundred 12: twelve 185: one hundred and eighty five 80: eighty 14: fourteen 30: thirty 40: forty 29: twenty nine 14: fourteen 300: three hundred 1: one 810: eight hundred and ten 18: eighteen 18: eighteen 40: forty 60: sixty 35: thirty five 000: zero 000: zero 9: nine 15: fifteen 17: seventeen 12: twelve 25: twenty five 30: thirty 21: twenty one 300: three hundred 10: ten 2009: two thousand and nine 45: forty five 30: thirty 59: fifty nine 7: seven 24: twenty four 5: five 000: zero 30: thirty 86: eighty six 32: thirty two 10: ten 5: five 70: seventy 100: one hundred 50: fifty 28: twenty eight 65: sixty five 45: forty five 30: thirty 40: forty 000: zero 1642: one thousand, six hundred and forty two 65: sixty five 12: twelve 1527: one thousand, five hundred and twenty seven 150: one hundred and fifty 250: two hundred and fifty 60: sixty 1900: one thousand, nine hundred 140: one hundred and forty 10: ten 12: twelve 13: thirteen 35: thirty five 12: twelve 40: forty 50: fifty 50: fifty 50: fifty 9: nine 5: five 1: one 125: one hundred and twenty five 5: five 28: twenty eight 24: twenty four 000: zero 18: eighteen 78: seventy eight 100: one hundred 10: ten 100: one hundred 18: eighteen 25: twenty five 1979: one thousand, nine hundred and seventy nine 33: thirty three 100: one hundred 100: one hundred 50: fifty 80: eighty 20: twenty 30: thirty 1: one 1709: one thousand, seven hundred and nine 1710: one thousand, seven hundred and ten 30: thirty 50: fifty 400: four hundred 218: two hundred and eighteen 97: ninety seven 24: twenty four 28: twenty eight 4: four 520: five hundred and twenty 32: thirty two 2013: two thousand and thirteen 2013: two thousand and thirteen 10: ten 160: one hundred and sixty 250: two hundred and fifty 2012: two thousand and twelve 78: seventy eight 2010: two thousand and ten 1980: one thousand, nine hundred and eighty 600: six hundred 500: five hundred 1: one 000: zero 30: thirty 400: four hundred 10: ten 200: two hundred 27: twenty seven 5: five 14: fourteen 90: ninety 25: twenty five 20: twenty 41: forty one 25: twenty five 100: one hundred 150: one hundred and fifty 20: twenty 30: thirty 25: twenty five 60: sixty 41: forty one 40: forty 47: forty seven 20: twenty 50: fifty 60: sixty 50: fifty 400: four hundred 1: one 20: twenty 140: one hundred and forty 1: one 80: eighty 20: twenty 95: ninety five 18: eighteen 500: five hundred 40: forty 50: fifty 500: five hundred 1: one 2015: two thousand and fifteen 100: one hundred 2: two 190: one hundred and ninety 400: four hundred 5: five 10: ten 50: fifty 5: five 36: thirty six 000: zero 67: sixty seven 100: one hundred 300: three hundred 125: one hundred and twenty five 30: thirty 11: eleven 16: sixteen 24: twenty four 2: two 4: four 3: three 30: thirty 2010: two thousand and ten 2010: two thousand and ten 2011: two thousand and eleven 1: one 20: twenty 40: forty 40: forty 8: eight 30: thirty 1947: one thousand, nine hundred and forty seven 60: sixty 5: five 60: sixty 12: twelve 12: twelve 1: one 12: twelve 1: one 12: twelve 2: two 2: two 12: twelve 12: twelve 12: twelve 2: two 100: one hundred 50: fifty 100: one hundred 1: one 12: twelve 000: zero 12: twelve 6: six 12: twelve 2000: two thousand 12: twelve 999: nine hundred and ninety nine 14: fourteen 12: twelve 000: zero 4: four 12: twelve 1: one 12: twelve 12: twelve 14: fourteen 12: twelve 000: zero 500: five hundred 500: five hundred 000: zero 10: ten 12: twelve 1: one 12: twelve 12: twelve 300: three hundred 500: five hundred 7: seven 000: zero 12: twelve 12: twelve 2: two 12: twelve 2011: two thousand and eleven 000: zero 14: fourteen 000: zero 63: sixty three 64: sixty four 180: one hundred and eighty 30: thirty 11: eleven 35: thirty five 8: eight 600: six hundred 95: ninety five 000: zero 450: four hundred and fifty 475: four hundred and seventy five 5: five 37: thirty seven 31: thirty one 35: thirty five 65: sixty five 000: zero 20: twenty 1835: one thousand, eight hundred and thirty five 54: fifty four 67: sixty seven 000: zero 000: zero 32: thirty two 000: zero 30: thirty 40: forty 58: fifty eight 000: zero 960: nine hundred and sixty 24: twenty four 27: twenty seven 000: zero 32: thirty two 89: eighty nine 70: seventy 48: forty eight 000: zero 62: sixty two 58: fifty eight 000: zero 12: twelve 69: sixty nine 160: one hundred and sixty 145: one hundred and forty five 500: five hundred 100: one hundred 93: ninety three 500: five hundred 30: thirty 50: fifty 1: one 13: thirteen 200: two hundred 000: zero 6: six 600: six hundred 63: sixty three 400: four hundred 60: sixty 1: one 1: one 000: zero 1: one 1: one 300: three hundred 25: twenty five 000: zero 24: twenty four 90: ninety 110: one hundred and ten 1935: one thousand, nine hundred and thirty five 30: thirty 50: fifty 800: eight hundred 100: one hundred 150: one hundred and fifty 250: two hundred and fifty 22: twenty two 200: two hundred 35: thirty five 2012: two thousand and twelve 1851: one thousand, eight hundred and fifty one 10: ten 24: twenty four 12: twelve 1943: one thousand, nine hundred and forty three 60: sixty 617: six hundred and seventeen 1533: one thousand, five hundred and thirty three 200: two hundred 1537: one thousand, five hundred and thirty seven 56: fifty six 26: twenty six 14: fourteen 2012: two thousand and twelve 20: twenty 30: thirty 21: twenty one 99: ninety nine 15: fifteen 12: twelve 900: nine hundred 1: one 1940: one thousand, nine hundred and forty 53: fifty three 400: four hundred 40: forty 1: one 2: two 52: fifty two 28: twenty eight 25: twenty five 48: forty eight 2008: two thousand and eight 60: sixty 75: seventy five 80: eighty 8: eight 5: five 100: one hundred 62: sixty two 12: twelve 11: eleven 371: three hundred and seventy one 371: three hundred and seventy one 000: zero 11: eleven 80: eighty 23: twenty three 40: forty 000: zero 6: six 2003: two thousand and three 93: ninety three 6: six 20: twenty 2003: two thousand and three 205: two hundred and five 69: sixty nine 33: thirty three 000: zero 59: fifty nine 56: fifty six 000: zero 40: forty 15: fifteen 70: seventy 50: fifty 60: sixty 29: twenty nine 45: forty five 80: eighty 100: one hundred 40: forty 8: eight 30: thirty 1: one 12: twelve 000: zero 55: fifty five 000: zero 10: ten 135: one hundred and thirty five 160: one hundred and sixty 110: one hundred and ten 90: ninety 15: fifteen 150: one hundred and fifty 16: sixteen 170: one hundred and seventy 20: twenty 54: fifty four 195: one hundred and ninety five 10: ten 450: four hundred and fifty 20: twenty 450: four hundred and fifty 400: four hundred 1: one 12: twelve 8: eight 200: two hundred 2: two 1: one 2010: two thousand and ten 25: twenty five 46: forty six 000: zero 62: sixty two 59: fifty nine 1995: one thousand, nine hundred and ninety five 000: zero 51: fifty one 56: fifty six 000: zero 000: zero 24: twenty four 66: sixty six 36: thirty six 57: fifty seven 000: zero 200: two hundred 1: one 76: seventy six 40: forty 20: twenty 16: sixteen 2012: two thousand and twelve 25: twenty five 50: fifty 14: fourteen 2013: two thousand and thirteen 61: sixty one 22: twenty two 11: eleven 66: sixty six 67: sixty seven 1846: one thousand, eight hundred and forty six 75: seventy five 120: one hundred and twenty 75: seventy five 5: five 5: five 11: eleven 3: three 50: fifty 2013: two thousand and thirteen 20: twenty 40: forty 3: three 35: thirty five 24: twenty four 12: twelve 10: ten 18: eighteen 18: eighteen 65: sixty five 100: one hundred 50: fifty 57: fifty seven 300: three hundred 50: fifty 8: eight 75: seventy five 30: thirty 50: fifty 28: twenty eight 10: ten 20: twenty 400: four hundred 500: five hundred 150: one hundred and fifty 7: seven 18: eighteen 26: twenty six 1984: one thousand, nine hundred and eighty four 462: four hundred and sixty two 2014: two thousand and fourteen 72: seventy two 1820: one thousand, eight hundred and twenty 2: two 1754: one thousand, seven hundred and fifty four 10: ten 10: ten 60: sixty 75: seventy five 94: ninety four 50: fifty 100: one hundred 370: three hundred and seventy 40: forty 50: fifty 51: fifty one 200: two hundred 1: one 70: seventy 100: one hundred 200: two hundred 1893: one thousand, eight hundred and ninety three 1991: one thousand, nine hundred and ninety one 100: one hundred 25: twenty five 100: one hundred 30: thirty 10: ten 1: one 200: two hundred 300: three hundred 4: four 120: one hundred and twenty 180: one hundred and eighty 230: two hundred and thirty 1946: one thousand, nine hundred and forty six 20: twenty 200: two hundred 400: four hundred 200: two hundred 500: five hundred 800: eight hundred 10: ten 20: twenty 200: two hundred 70: seventy 10: ten 30: thirty 10: ten 2: two 150: one hundred and fifty 31: thirty one 30: thirty 50: fifty 20: twenty 30: thirty 40: forty 10: ten 500: five hundred 12: twelve 30: thirty 25: twenty five 30: thirty 139: one hundred and thirty nine 25: twenty five 10: ten 20: twenty 638: six hundred and thirty eight 13: thirteen 10: ten 12: twelve 12: twelve 28: twenty eight 35: thirty five 60: sixty 100: one hundred 5: five 40: forty 12: twelve 25: twenty five 25: twenty five 9: nine 5: five 15: fifteen 25: twenty five 000: zero 150: one hundred and fifty 100: one hundred 150: one hundred and fifty 100: one hundred 150: one hundred and fifty 30: thirty 45: forty five 930: nine hundred and thirty 9: nine 4: four 1: one 48: forty eight 400: four hundred 2: two 707: seven hundred and seven 707: seven hundred and seven 50: fifty 2: two 50: fifty 2: two 350: three hundred and fifty 14: fourteen 24: twenty four 25: twenty five 30: thirty 12: twelve 50: fifty 2: two 975: nine hundred and seventy five 300: three hundred 200: two hundred 2: two 600: six hundred 000: zero 5: five 12: twelve 45: forty five 120: one hundred and twenty 800: eight hundred 1: one 38: thirty eight 1: one 2012: two thousand and twelve 1: one 30: thirty 50: fifty 60: sixty 5: five 300: three hundred 200: two hundred 1947: one thousand, nine hundred and forty seven 52: fifty two 90: ninety 2000: two thousand 40: forty 100: one hundred 000: zero 95: ninety five 40: forty 14: fourteen 2: two 79: seventy nine 45: forty five 100: one hundred 500: five hundred 600: six hundred 100: one hundred 2: two 194: one hundred and ninety four 15: fifteen 1779: one thousand, seven hundred and seventy nine 60: sixty 25: twenty five 200: two hundred 250: two hundred and fifty 30: thirty 600: six hundred 100: one hundred 30: thirty 50: fifty 80: eighty 65: sixty five 25: twenty five 20: twenty 180: one hundred and eighty 250: two hundred and fifty 50: fifty 5: five 000: zero 10: ten 200: two hundred 1800: one thousand, eight hundred 1910: one thousand, nine hundred and ten 10: ten 12: twelve 75: seventy five 100: one hundred 15: fifteen 70: seventy 60: sixty 99: ninety nine 45: forty five 5: five 000: zero 40: forty 60: sixty 12: twelve 15: fifteen 400: four hundred 600: six hundred 1: one 853: eight hundred and fifty three 46: forty six 500: five hundred 25: twenty five 1888: one thousand, eight hundred and eighty eight 250: two hundred and fifty 10: ten 5: five 1958: one thousand, nine hundred and fifty eight 85: eighty five 600: six hundred 800: eight hundred 2006: two thousand and six 15: fifteen 200: two hundred 300: three hundred 38: thirty eight 28: twenty eight 100: one hundred 60: sixty 300: three hundred 300: three hundred 75: seventy five 1899: one thousand, eight hundred and ninety nine 300: three hundred 2: two 200: two hundred 21: twenty one 100: one hundred 40: forty 125: one hundred and twenty five 24: twenty four 50: fifty 23: twenty three 13: thirteen 10: ten 120: one hundred and twenty 50: fifty 38: thirty eight 60: sixty 30: thirty 150: one hundred and fifty 12: twelve 2: two 32: thirty two 2012: two thousand and twelve 800: eight hundred 1907: one thousand, nine hundred and seven 50: fifty 50: fifty 35: thirty five 25: twenty five 1: one 90: ninety 450: four hundred and fifty 900: nine hundred 400: four hundred 500: five hundred 750: seven hundred and fifty 58: fifty eight 370: three hundred and seventy 42: forty two 10: ten 150: one hundred and fifty 50: fifty 1901: one thousand, nine hundred and one 5: five 10: ten 30: thirty 20: twenty 4: four 1985: one thousand, nine hundred and eighty five 175: one hundred and seventy five 12: twelve 11: eleven 60: sixty 12: twelve 100: one hundred 100: one hundred 500: five hundred 11: eleven 100: one hundred 300: three hundred 50: fifty 48: forty eight 30: thirty 1924: one thousand, nine hundred and twenty four 160: one hundred and sixty 80: eighty 24: twenty four 15: fifteen 600: six hundred 2: two 1: one 20: twenty 40: forty 50: fifty 15: fifteen 10: ten 10: ten 25: twenty five 70: seventy 20: twenty 100: one hundred 200: two hundred 15: fifteen 34: thirty four 20: twenty 20: twenty 26: twenty six 3: three 32: thirty two 20: twenty 5: five 60: sixty 800: eight hundred 100: one hundred 20: twenty 65: sixty five 50: fifty 20: twenty 10: ten 150: one hundred and fifty 2: two 63: sixty three 33: thirty three 100: one hundred 000: zero 000: zero 500: five hundred 90: ninety 50: fifty 20: twenty 1900: one thousand, nine hundred 10: ten 200: two hundred 17: seventeen 30: thirty 24: twenty four 120: one hundred and twenty 100: one hundred 100: one hundred 1: one 120: one hundred and twenty 27: twenty seven 1934: one thousand, nine hundred and thirty four 673: six hundred and seventy three 29: twenty nine 30: thirty 6: six 600: six hundred 200: two hundred 62: sixty two 100: one hundred 3: three 180: one hundred and eighty 142: one hundred and forty two 100: one hundred 1958: one thousand, nine hundred and fifty eight 25: twenty five 16: sixteen 300: three hundred 400: four hundred 000: zero 30: thirty 10: ten 12: twelve 24: twenty four 300: three hundred 40: forty 80: eighty 400: four hundred 8: eight 200: two hundred 300: three hundred 800: eight hundred 12: twelve 000: zero 20: twenty 7: seven 0: zero 40: forty 40: forty 75: seventy five 20: twenty 4: four 4: four 29: twenty nine 1770: one thousand, seven hundred and seventy 000: zero 627: six hundred and twenty seven 465: four hundred and sixty five 375: three hundred and seventy five 385: three hundred and eighty five 3: three 2011: two thousand and eleven 1942: one thousand, nine hundred and forty two 60: sixty 7: seven 18: eighteen 000: zero 2001: two thousand and one 11: eleven 8: eight 5: five 180: one hundred and eighty 30: thirty 75: seventy five 1993: one thousand, nine hundred and ninety three 1978: one thousand, nine hundred and seventy eight 18: eighteen 20: twenty 3: three 1: one 24: twenty four 20: twenty 24: twenty four 150: one hundred and fifty 15: fifteen 1850: one thousand, eight hundred and fifty 1035: one thousand and thirty five 100: one hundred 000: zero 20: twenty 1887: one thousand, eight hundred and eighty seven 26: twenty six 15: fifteen 155: one hundred and fifty five 30: thirty 240: two hundred and forty 15: fifteen 115: one hundred and fifteen 10: ten 1: one 10: ten 94: ninety four 24: twenty four 2: two 2: two 300: three hundred 450: four hundred and fifty 65: sixty five 100: one hundred 100: one hundred 195: one hundred and ninety five 300: three hundred 165: one hundred and sixty five 37: thirty seven 1: one 28: twenty eight 1814: one thousand, eight hundred and fourteen 200: two hundred 100: one hundred 75: seventy five 100: one hundred 450: four hundred and fifty 32: thirty two 20: twenty 246: two hundred and forty six 270: two hundred and seventy 400: four hundred 125: one hundred and twenty five 380: three hundred and eighty 125: one hundred and twenty five 200: two hundred 250: two hundred and fifty 21: twenty one 1: one 450: four hundred and fifty 40: forty 15: fifteen 170: one hundred and seventy 15: fifteen 700: seven hundred 9: nine 200: two hundred 400: four hundred 75: seventy five 600: six hundred 300: three hundred 170: one hundred and seventy 10: ten 2014: two thousand and fourteen 000: zero 3: three 000: zero 30: thirty 1948: one thousand, nine hundred and forty eight 15: fifteen 50: fifty 20: twenty 18: eighteen 18: eighteen 32: thirty two 100: one hundred 1: one 500: five hundred 1338: one thousand, three hundred and thirty eight 12: twelve 24: twenty four 000: zero 10: ten 1545: one thousand, five hundred and forty five 25: twenty five 2008: two thousand and eight 500: five hundred 112: one hundred and twelve 16: sixteen 35: thirty five 500: five hundred 10: ten 4: four 700: seven hundred 17: seventeen 17: seventeen 2: two 8: eight 17: seventeen 1: one 100: one hundred 2014: two thousand and fourteen 515: five hundred and fifteen 210: two hundred and ten 125: one hundred and twenty five 550: five hundred and fifty 625: six hundred and twenty five 430: four hundred and thirty 150: one hundred and fifty 15: fifteen 75: seventy five 190: one hundred and ninety 133: one hundred and thirty three 295: two hundred and ninety five 37: thirty seven 000: zero 65: sixty five 425: four hundred and twenty five 147: one hundred and forty seven 8: eight 10: ten 62: sixty two 110: one hundred and ten 15: fifteen 20: twenty 20: twenty 21: twenty one 14: fourteen 3: three 25: twenty five 30: thirty 100: one hundred 110: one hundred and ten 800: eight hundred 200: two hundred 000: zero 20: twenty 1: one 400: four hundred 12: twelve 24: twenty four 6000: six thousand 135: one hundred and thirty five 20: twenty 30: thirty 50: fifty 15: fifteen 40: forty 000: zero 1905: one thousand, nine hundred and five 100: one hundred 3: three 100: one hundred 100: one hundred 50: fifty 2014: two thousand and fourteen 1984: one thousand, nine hundred and eighty four 2014: two thousand and fourteen 14: fourteen 87: eighty seven 3: three 20: twenty 65: sixty five 170: one hundred and seventy 11: eleven 2001: two thousand and one 96: ninety six 20: twenty 1945: one thousand, nine hundred and forty five 1965: one thousand, nine hundred and sixty five 1739: one thousand, seven hundred and thirty nine 28: twenty eight 161: one hundred and sixty one 35: thirty five 000: zero 14: fourteen 70: seventy 10: ten 20: twenty 40: forty 1606: one thousand, six hundred and six 10: ten 20: twenty 25: twenty five 300: three hundred 500: five hundred 200: two hundred 1996: one thousand, nine hundred and ninety six 95: ninety five 18: eighteen 59: fifty nine 23: twenty three 18: eighteen 12: twelve 21: twenty one 1976: one thousand, nine hundred and seventy six 25: twenty five 9: nine 458: four hundred and fifty eight 40: forty 11: eleven 15: fifteen 11: eleven 75: seventy five 15: fifteen 15: fifteen 20: twenty 1: one 10: ten 20: twenty 40: forty 85: eighty five 35: thirty five 5: five 1949: one thousand, nine hundred and forty nine 20: twenty 40: forty 11: eleven 70: seventy 30: thirty 000: zero 50: fifty 000: zero 65: sixty five 2016: two thousand and sixteen 28: twenty eight 100: one hundred 150: one hundred and fifty 2050: two thousand and fifty 100: one hundred 1963: one thousand, nine hundred and sixty three 20: twenty 80: eighty 100: one hundred 20: twenty 17: seventeen 000: zero 300: three hundred 46: forty six 700: seven hundred 1527: one thousand, five hundred and twenty seven 82: eighty two 60: sixty 18: eighteen 20: twenty 55: fifty five 1958: one thousand, nine hundred and fifty eight 16: sixteen 13: thirteen 12: twelve 000: zero 100: one hundred 14: fourteen 30: thirty 155: one hundred and fifty five 50: fifty 22: twenty two 000: zero 000: zero 40: forty 100: one hundred 150: one hundred and fifty 52: fifty two 11: eleven 24: twenty four 300: three hundred 400: four hundred 30: thirty 40: forty 30: thirty 27: twenty seven 46: forty six 300: three hundred 10: ten 4: four 7: seven 40: forty 5: five 5: five 80: eighty 80: eighty 150: one hundred and fifty 76: seventy six 20: twenty 2013: two thousand and thirteen 40: forty 20: twenty 930: nine hundred and thirty 1800: one thousand, eight hundred 680: six hundred and eighty 500: five hundred 80: eighty 90: ninety 95: ninety five 50: fifty 11: eleven 24: twenty four 300: three hundred 3: three 000: zero 89: eighty nine 20: twenty 40: forty 5: five 191: one hundred and ninety one 17: seventeen 1963: one thousand, nine hundred and sixty three 40: forty 60: sixty 40: forty 50: fifty 4: four 96: ninety six 0: zero 150: one hundred and fifty 180: one hundred and eighty 5: five 200: two hundred 300: three hundred 26: twenty six 11: eleven 15: fifteen 20: twenty 200: two hundred 120: one hundred and twenty 1: one 20: twenty 90: ninety 100: one hundred 200: two hundred 300: three hundred 100: one hundred 10: ten 20: twenty 82: eighty two 20: twenty 200: two hundred 100: one hundred 200: two hundred 100: one hundred 30: thirty 60: sixty 200: two hundred 200: two hundred 3: three 97: ninety seven 5: five 1: one 5: five 85: eighty five 40: forty 15: fifteen 20: twenty 2: two 4: four 2: two 500: five hundred 20: twenty 20: twenty 25: twenty five 30: thirty 27: twenty seven 000: zero 50: fifty 12: twelve 1: one 12: twelve 1: one 12: twelve 30: thirty 1: one 2: two 000: zero 520: five hundred and twenty 30: thirty 18: eighteen 18: eighteen 16: sixteen 18: eighteen 15: fifteen 20: twenty 7: seven 460: four hundred and sixty 1: one 30: thirty 3: three 2: two 12: twelve 2022: two thousand and twenty two 63: sixty three 3: three 15: fifteen 10: ten 2010: two thousand and ten 1923: one thousand, nine hundred and twenty three 11: eleven 25: twenty five 21: twenty one 100: one hundred 140: one hundred and forty 150: one hundred and fifty 200: two hundred 90: ninety 75: seventy five 20: twenty 16: sixteen 1723: one thousand, seven hundred and twenty three 150: one hundred and fifty 100: one hundred 200: two hundred 50: fifty 100: one hundred 90: ninety 200: two hundred 300: three hundred 30: thirty 24: twenty four 5: five 000: zero 000: zero 1: one 70: seventy 2: two 84: eighty four 1988: one thousand, nine hundred and eighty eight 15: fifteen 150: one hundred and fifty 29: twenty nine 38: thirty eight 1991: one thousand, nine hundred and ninety one 40: forty 55: fifty five 15: fifteen 20: twenty 47: forty seven 28: twenty eight 30: thirty 25: twenty five 98: ninety eight 230: two hundred and thirty 11: eleven 30: thirty 28: twenty eight 1605: one thousand, six hundred and five 40: forty 16: sixteen 80: eighty 120: one hundred and twenty 8: eight 60: sixty 20: twenty 30: thirty 1989: one thousand, nine hundred and eighty nine 20: twenty 18: eighteen 1987: one thousand, nine hundred and eighty seven 1926: one thousand, nine hundred and twenty six 13: thirteen 14: fourteen 250: two hundred and fifty 150: one hundred and fifty 400: four hundred 500: five hundred 120: one hundred and twenty 40: forty 40: forty 7: seven 9: nine 12: twelve 30: thirty 90: ninety 15: fifteen 150: one hundred and fifty 100: one hundred 180: one hundred and eighty 000: zero 175: one hundred and seventy five 150: one hundred and fifty 1828: one thousand, eight hundred and twenty eight 70: seventy 56: fifty six 10: ten 15: fifteen 20: twenty 130: one hundred and thirty 15: fifteen 20: twenty 12: twelve 330: three hundred and thirty 400: four hundred 100: one hundred 100: one hundred 100: one hundred 1911: one thousand, nine hundred and eleven 10: ten 340: three hundred and forty 75: seventy five 60: sixty 1828: one thousand, eight hundred and twenty eight 70: seventy 14: fourteen 15: fifteen 1651: one thousand, six hundred and fifty one 95: ninety five 20: twenty 50: fifty 60: sixty 520: five hundred and twenty 750: seven hundred and fifty 1948: one thousand, nine hundred and forty eight 800: eight hundred 20: twenty 25: twenty five 25: twenty five 60: sixty 500: five hundred 000: zero 58: fifty eight 20: twenty 750: seven hundred and fifty 90: ninety 10: ten 15: fifteen 1: one 12: twelve 000: zero 000: zero 20: twenty 24: twenty four 34: thirty four 729: seven hundred and twenty nine 20: twenty 10: ten 100: one hundred 130: one hundred and thirty 26: twenty six 000: zero 800: eight hundred 37: thirty seven 34: thirty four 90: ninety 15: fifteen 465: four hundred and sixty five 25: twenty five 000: zero 800: eight hundred 15: fifteen 3: three 50: fifty 2: two 18: eighteen 400: four hundred 415: four hundred and fifteen 30: thirty 19: nineteen 23: twenty three 400: four hundred 1706: one thousand, seven hundred and six 16: sixteen 20: twenty 200: two hundred 21: twenty one 500: five hundred 125: one hundred and twenty five 45: forty five 28: twenty eight 29: twenty nine 54: fifty four 16: sixteen 6: six 60: sixty 20: twenty 67: sixty seven 22: twenty two 60: sixty 000: zero 15: fifteen 200: two hundred 5: five 600: six hundred 15: fifteen 1933: one thousand, nine hundred and thirty three 10: ten 40: forty 90: ninety 000: zero 100: one hundred 000: zero 55: fifty five 400: four hundred 30: thirty 000: zero 246: two hundred and forty six 68: sixty eight 30: thirty 135: one hundred and thirty five 160: one hundred and sixty 175: one hundred and seventy five 000: zero 000: zero 2006: two thousand and six 57: fifty seven 185: one hundred and eighty five 125: one hundred and twenty five 205: two hundred and five 950: nine hundred and fifty 60: sixty 125: one hundred and twenty five 450: four hundred and fifty 385: three hundred and eighty five 5: five 7: seven 3: three 31: thirty one 80: eighty 69: sixty nine 8: eight 000: zero 350: three hundred and fifty 400: four hundred 550: five hundred and fifty 600: six hundred 650: six hundred and fifty 000: zero 000: zero 165: one hundred and sixty five 15: fifteen 90: ninety 20: twenty 10: ten 85: eighty five 100: one hundred 2: two 120: one hundred and twenty 13: thirteen 260: two hundred and sixty 27: twenty seven 61: sixty one 300: three hundred 165: one hundred and sixty five 175: one hundred and seventy five 50: fifty 225: two hundred and twenty five 35: thirty five 35: thirty five 42: forty two 500: five hundred 70: seventy 700: seven hundred 245: two hundred and forty five 90: ninety 120: one hundred and twenty 200: two hundred 65: sixty five 50: fifty 2: two 60: sixty 8: eight 425: four hundred and twenty five 225: two hundred and twenty five 1: one 200: two hundred 99: ninety nine 50: fifty 000: zero 400: four hundred 500: five hundred 100: one hundred 300: three hundred 500: five hundred 99: ninety nine 3: three 1: one 35: thirty five 270: two hundred and seventy 32: thirty two 32: thirty two 30: thirty 20: twenty 130: one hundred and thirty 300: three hundred 23: twenty three 12: twelve 22: twenty two 20: twenty 22: twenty two 29: twenty nine 000: zero 60: sixty 10: ten 15: fifteen 239: two hundred and thirty nine 14: fourteen 250: two hundred and fifty 16: sixteen 350: three hundred and fifty 370: three hundred and seventy 350: three hundred and fifty 000: zero 99: ninety nine 53: fifty three 5: five 75: seventy five 40: forty 40: forty 30: thirty 2017: two thousand and seventeen 2: two 16: sixteen 18: eighteen 5: five 10: ten 20: twenty 6: six 15: fifteen 20: twenty 000: zero 1918: one thousand, nine hundred and eighteen 1923: one thousand, nine hundred and twenty three 53: fifty three 55: fifty five 130: one hundred and thirty 12: twelve 69: sixty nine 160: one hundred and sixty 36: thirty six 200: two hundred 100: one hundred 150: one hundred and fifty 95: ninety five 9: nine 155: one hundred and fifty five 10: ten 25: twenty five 30: thirty 235: two hundred and thirty five 125: one hundred and twenty five 110: one hundred and ten 1981: one thousand, nine hundred and eighty one 12: twelve 50: fifty 10: ten 1: one 1967: one thousand, nine hundred and sixty seven 90: ninety 400: four hundred 40: forty 4: four 000: zero 50: fifty 30: thirty 100: one hundred 25: twenty five 8: eight 1745: one thousand, seven hundred and forty five 000: zero 8: eight 5: five 2016: two thousand and sixteen 10: ten 1: one 15: fifteen 80: eighty 35: thirty five 20: twenty 100: one hundred 50: fifty 40: forty 10: ten 10: ten 620: six hundred and twenty 12: twelve 350: three hundred and fifty 500: five hundred 14: fourteen 10: ten 11: eleven 11: eleven 10: ten 647: six hundred and forty seven 101: one hundred and one 30: thirty 200: two hundred 200: two hundred 300: three hundred 140: one hundred and forty 1918: one thousand, nine hundred and eighteen 1961: one thousand, nine hundred and sixty one 1940: one thousand, nine hundred and forty 18: eighteen 250: two hundred and fifty 165: one hundred and sixty five 85: eighty five 42: forty two 820: eight hundred and twenty 44: forty four 250: two hundred and fifty 1759: one thousand, seven hundred and fifty nine 100: one hundred 15: fifteen 60: sixty 20: twenty 30: thirty 23: twenty three 13: thirteen 7: seven 000: zero 000: zero 9000: nine thousand 966: nine hundred and sixty six 100: one hundred 55: fifty five 15: fifteen 500: five hundred 14: fourteen 65: sixty five 96: ninety six 5: five 20: twenty 35: thirty five 200: two hundred 30: thirty 1940: one thousand, nine hundred and forty 15: fifteen 18: eighteen 5: five 30: thirty 20: twenty 100: one hundred 14: fourteen 95: ninety five 4: four 2: two 1960: one thousand, nine hundred and sixty 1963: one thousand, nine hundred and sixty three 57: fifty seven 900: nine hundred 6: six 90: ninety 40: forty 000: zero 2: two 30: thirty 000: zero 2009: two thousand and nine 10: ten 10: ten 40: forty 60: sixty 25: twenty five 35: thirty five 78: seventy eight 1826: one thousand, eight hundred and twenty six 12: twelve 6: six 25: twenty five 27: twenty seven 1: one 300: three hundred 400: four hundred 100: one hundred 60: sixty 425: four hundred and twenty five 000: zero 10: ten 13: thirteen 425: four hundred and twenty five 6: six 100: one hundred 70: seventy 6: six 100: one hundred 17: seventeen 100: one hundred 120: one hundred and twenty 50: fifty 5: five 12: twelve 75: seventy five 10: ten 209: two hundred and nine 50: fifty 260: two hundred and sixty 260: two hundred and sixty 160: one hundred and sixty 14: fourteen 50: fifty 26: twenty six 18: eighteen 8: eight 70: seventy 20: twenty 500: five hundred 1: one 3: three 12: twelve 1878: one thousand, eight hundred and seventy eight 30: thirty 40: forty 6: six 2015: two thousand and fifteen 7: seven 12: twelve 1977: one thousand, nine hundred and seventy seven 30: thirty 1954: one thousand, nine hundred and fifty four 20: twenty 12: twelve 2015: two thousand and fifteen 2: two 54: fifty four 10: ten 24: twenty four 300: three hundred 218: two hundred and eighteen 35: thirty five 1951: one thousand, nine hundred and fifty one 20: twenty 90: ninety 90: ninety 15: fifteen 854: eight hundred and fifty four 1985: one thousand, nine hundred and eighty five 20: twenty 000: zero 30: thirty 3: three 3: three 5: five 49: forty nine 350: three hundred and fifty 100: one hundred 200: two hundred 105: one hundred and five 12: twelve 1: one 5: five 400: four hundred 5: five 1: one 2: two 000: zero 2011: two thousand and eleven 1911: one thousand, nine hundred and eleven 1967: one thousand, nine hundred and sixty seven 15: fifteen 11: eleven 2016: two thousand and sixteen 646: six hundred and forty six 2: two 2: two 1973: one thousand, nine hundred and seventy three 65: sixty five 100: one hundred 150: one hundred and fifty 600: six hundred 400: four hundred 500: five hundred 1994: one thousand, nine hundred and ninety four 17: seventeen 30: thirty 15: fifteen 200: two hundred 300: three hundred 15: fifteen 2016: two thousand and sixteen 50: fifty 50: fifty 100: one hundred 520: five hundred and twenty 150: one hundred and fifty 2300: two thousand, three hundred 24: twenty four 15: fifteen 40: forty 10: ten 200: two hundred 1: one 700: seven hundred 200: two hundred 18: eighteen 15: fifteen 20: twenty 14: fourteen 69: sixty nine 3: three 200: two hundred 25: twenty five 930: nine hundred and thirty 10: ten 2016: two thousand and sixteen 2016: two thousand and sixteen 2016: two thousand and sixteen 180: one hundred and eighty 13: thirteen 7: seven 1852: one thousand, eight hundred and fifty two 1: one 35: thirty five 150: one hundred and fifty 42: forty two 2: two 100: one hundred 70: seventy 100: one hundred 20: twenty 30: thirty 4: four 60: sixty 100: one hundred 100: one hundred 700: seven hundred 20: twenty 000: zero 70: seventy 1: one 20: twenty 65: sixty five 70: seventy 20: twenty 20: twenty 100: one hundred 000: zero 30: thirty 110: one hundred and ten 22: twenty two 24: twenty four 120: one hundred and twenty 10: ten 12: twelve 14: fourteen 5: five 31: thirty one 12: twelve 7: seven 9: nine 2: two 74: seventy four 1: one 2009: two thousand and nine 40: forty 1969: one thousand, nine hundred and sixty nine 1998: one thousand, nine hundred and ninety eight 14: fourteen 1973: one thousand, nine hundred and seventy three 000: zero 100: one hundred 25: twenty five 20: twenty 48: forty eight 17: seventeen 24: twenty four 4: four 000: zero 2: two 55: fifty five 4: four 24: twenty four 28: twenty eight 27: twenty seven 27: twenty seven 30: thirty 30: thirty 12: twelve 600: six hundred 500: five hundred 1835: one thousand, eight hundred and thirty five 22: twenty two 20: twenty 45: forty five 60: sixty 89: eighty nine 40: forty 0: zero 12: twelve 10: ten 1826: one thousand, eight hundred and twenty six 000: zero 5: five 1848: one thousand, eight hundred and forty eight 14: fourteen 16: sixteen 11: eleven 800: eight hundred 300: three hundred 000: zero 100: one hundred 600: six hundred 125: one hundred and twenty five 20: twenty 85: eighty five 40: forty 8: eight 18: eighteen 73: seventy three 40: forty 2: two 1948: one thousand, nine hundred and forty eight 80: eighty 500: five hundred 200: two hundred 400: four hundred 1948: one thousand, nine hundred and forty eight 20: twenty 1790: one thousand, seven hundred and ninety 1984: one thousand, nine hundred and eighty four 2: two 000: zero 400: four hundred 150: one hundred and fifty 15: fifteen 25: twenty five 000: zero 1: one 000: zero 47: forty seven 22: twenty two 5: five 40: forty 100: one hundred 40: forty 10: ten 2: two 1: one 49: forty nine 100: one hundred 350: three hundred and fifty 155: one hundred and fifty five 90: ninety 95: ninety five 000: zero 325: three hundred and twenty five 320: three hundred and twenty 325: three hundred and twenty five 10: ten 20: twenty 15: fifteen 30: thirty 80: eighty 60: sixty 15: fifteen 1: one 10: ten 20: twenty 2: two 1848: one thousand, eight hundred and forty eight 1842: one thousand, eight hundred and forty two 90: ninety 25: twenty five 30: thirty 10: ten
In [ ]:
# Tokenize all sentences in the dataframe
all_tokens = [token for sentence in df['sentence'] for token in nltk.word_tokenize(sentence)]
# Count the frequency of each token
token_counts = Counter(all_tokens)
# Get the top 20 most frequent tokens
common_tokens = token_counts.most_common(20)
# Set a professional color palette and style
sns.set_style("ticks", {"xtick.major.size": 8, "ytick.major.size": 8})
sns.set_context("talk", font_scale=0.8)
color = '#2980B9' # Slightly deeper shade of blue
# Plot
plt.figure(figsize=(13, 12))
# Plotting each bar with the refined color
tokens, frequencies = zip(*common_tokens)
for token, freq in common_tokens:
plt.barh(token, freq, color=color, edgecolor='silver', height=0.7)
plt.text(freq + 10, token, str(freq), va='center', color='black', fontsize=12) # Adjusted annotation
# Refining title and axis labels for a polished look
plt.title('Top 20 Most Frequent Tokens', fontsize=20, fontweight='bold', pad=20)
plt.xlabel('Frequency', fontsize=16)
plt.ylabel('Tokens', fontsize=16)
plt.gca().invert_yaxis() # To display the most frequent token at the top
# Introducing subtle gridlines for better mapping
plt.grid(axis='x', linestyle='--', alpha=0.6)
# Adjusting axis ticks for aesthetics
plt.xticks(fontsize=14)
plt.yticks(fontsize=14)
plt.tight_layout()
plt.show()
In [ ]:
df.head()
Out[ ]:
sentence | |
---|---|
0 | WHEN YOU'RE COOKING CHIPS AT HOME |
1 | THE TRADITIONAL CHIP PAN OFTEN STAYS ON THE SHELF |
2 | THROUGH WHAT THEY CALL A KNIFE BLOCK |
3 | WHICH INVOLVES FIRING A POTATO DOWN A PIPE |
4 | APART FROM THE GOLDEN COLOUR AND THE DELICIOUS... |
In [ ]:
for percentile in [25, 50, 75, 90, 95, 99]:
print(f"{percentile}th percentile:", np.percentile(sentence_lengths, percentile))
unique_words = set(word for sentence in df['sentence'] for word in sentence.split())
print("Total unique words:", len(unique_words))
word_counts = Counter(word for sentence in df['sentence'] for word in sentence.split())
print("Most common words:", word_counts.most_common(10))
print("Least common words:", word_counts.most_common()[:-11:-1])
25th percentile: 5.0 50th percentile: 6.0 75th percentile: 9.0 90th percentile: 13.0 95th percentile: 16.0 99th percentile: 20.0 Total unique words: 17388 Most common words: [('THE', 16538), ('TO', 9609), ('A', 8610), ('AND', 8595), ('OF', 7332), ('I', 5829), ('IT', 5226), ('IN', 5052), ('THAT', 4827), ('YOU', 4757)] Least common words: [('SEIZURES', 1), ('PERSUADERS', 1), ('BANKRUPTING', 1), ('REWROTE', 1), ('FLAWS', 1), ('RHINE', 1), ('BROCKEN', 1), ('CROWDED', 1), ("TROTSKY'S", 1), ('UNISON', 1)]
In [ ]:
# Define a pattern for common contractions
common_contractions_pattern = r"\b(?:[a-zA-Z]+n't|[a-zA-Z]+'ll|[a-zA-Z]+'ve|[a-zA-Z]+'re|[a-zA-Z]+'d|[a-zA-Z]+'s)\b"
# Find common contractions in each line and store them
contractions_counter = Counter()
for line in df['sentence']:
contractions_counter.update(re.findall(common_contractions_pattern, line.lower()))
# Get the most common contractions and their counts
most_common_contractions = contractions_counter.most_common()
# Calculate total contractions found
total_contractions = sum(contractions_counter.values())
most_common_contractions, total_contractions
Out[ ]:
([("it's", 2445), ("that's", 1015), ("don't", 978), ("you're", 522), ("i've", 494), ("we've", 492), ("there's", 422), ("we're", 416), ("they're", 391), ("let's", 358), ("you've", 345), ("can't", 313), ("he's", 312), ("didn't", 258), ("i'll", 201), ("i'd", 187), ("she's", 186), ("what's", 183), ("wasn't", 177), ("doesn't", 163), ("they've", 157), ("we'll", 155), ("wouldn't", 122), ("haven't", 100), ("won't", 98), ("you'll", 89), ("couldn't", 85), ("isn't", 85), ("today's", 80), ("you'd", 76), ("they'll", 68), ("we'd", 68), ("he'd", 53), ("weren't", 49), ("aren't", 41), ("they'd", 41), ("who's", 41), ("it'll", 38), ("here's", 35), ("hadn't", 30), ("year's", 27), ("britain's", 26), ("tonight's", 26), ("world's", 25), ("people's", 23), ("shouldn't", 22), ("everyone's", 21), ("hasn't", 20), ("he'll", 19), ("everybody's", 18), ("would've", 18), ("she'd", 15), ("life's", 12), ("mother's", 11), ("children's", 11), ("father's", 11), ("week's", 11), ("who've", 10), ("someone's", 9), ("wife's", 9), ("women's", 9), ("ain't", 9), ("man's", 9), ("nation's", 9), ("bbc's", 8), ("it'd", 8), ("she'll", 8), ("one's", 7), ("name's", 7), ("weekend's", 7), ("how's", 7), ("dad's", 7), ("night's", 7), ("that'll", 7), ("london's", 6), ("king's", 6), ("mum's", 6), ("where's", 6), ("time's", 6), ("matt's", 5), ("thing's", 5), ("market's", 5), ("weather's", 5), ("everything's", 5), ("there'll", 5), ("paul's", 5), ("bradshaw's", 5), ("queen's", 5), ("daren't", 4), ("europe's", 4), ("boy's", 4), ("country's", 4), ("nature's", 4), ("else's", 4), ("england's", 4), ("men's", 4), ("tv's", 4), ("team's", 4), ("something's", 4), ("somebody's", 4), ("work's", 4), ("phil's", 4), ("webster's", 4), ("shakespeare's", 4), ("peter's", 4), ("month's", 3), ("other's", 3), ("anything's", 3), ("dave's", 3), ("town's", 3), ("city's", 3), ("god's", 3), ("who'd", 3), ("woman's", 3), ("uk's", 3), ("kate's", 3), ("henry's", 3), ("island's", 3), ("county's", 3), ("girl's", 3), ("day's", 3), ("charlie's", 3), ("nobody's", 3), ("david's", 3), ("bid's", 3), ("grandmother's", 3), ("gentleman's", 3), ("tom's", 3), ("tomorrow's", 3), ("harm's", 3), ("edward's", 3), ("hogarth's", 3), ("mustn't", 3), ("brother's", 3), ("family's", 3), ("sun's", 2), ("soldier's", 2), ("should've", 2), ("son's", 2), ("show's", 2), ("christ's", 2), ("lawrence's", 2), ("money's", 2), ("planet's", 2), ("thomas's", 2), ("person's", 2), ("company's", 2), ("majesty's", 2), ("individual's", 2), ("buyer's", 2), ("mistress's", 2), ("george's", 2), ("pam's", 2), ("labour's", 2), ("club's", 2), ("miranda's", 2), ("centurion's", 2), ("john's", 2), ("gourmet's", 2), ("shan't", 2), ("november's", 2), ("spencer's", 2), ("jack's", 2), ("farming's", 2), ("maker's", 2), ("jesus's", 2), ("brand's", 2), ("rhod's", 2), ("mark's", 2), ("there'd", 2), ("when's", 2), ("valentine's", 2), ("whatever's", 2), ("busman's", 2), ("relief's", 2), ("item's", 2), ("oak's", 2), ("lee's", 2), ("georgie's", 2), ("summer's", 2), ("shepherd's", 2), ("nash's", 2), ("animal's", 2), ("alzheimer's", 2), ("doctor's", 2), ("husband's", 2), ("bobby's", 2), ("america's", 2), ("cathedral's", 2), ("gentlemen's", 2), ("tim's", 2), ("could've", 2), ("daddy's", 2), ("mick's", 2), ("emma's", 2), ("yesterday's", 2), ("television's", 2), ("anybody's", 2), ("agency's", 2), ("roscoff's", 2), ("paula's", 2), ("lady's", 2), ("saleroom's", 2), ("pete's", 2), ("goat's", 2), ("gully's", 1), ("sheep's", 1), ("later's", 1), ("barr's", 1), ("gaynor's", 1), ("bar's", 1), ("church's", 1), ("rachel's", 1), ("age's", 1), ("galileo's", 1), ("jennifer's", 1), ("kathy's", 1), ("titchmarsh's", 1), ("century's", 1), ("conqueror's", 1), ("dermot's", 1), ("damien's", 1), ("bohemond's", 1), ("marconi's", 1), ("annie's", 1), ("richard's", 1), ("topography's", 1), ("owner's", 1), ("chief's", 1), ("handler's", 1), ("hunt's", 1), ("government's", 1), ("riding's", 1), ("nhs'll", 1), ("katy's", 1), ("sotheby's", 1), ("eyre's", 1), ("cromwell's", 1), ("spix's", 1), ("nic's", 1), ("dealer's", 1), ("parent's", 1), ("frank's", 1), ("legion's", 1), ("derbyshire's", 1), ("cassini's", 1), ("newborn's", 1), ("garrow's", 1), ("clive's", 1), ("neck's", 1), ("edmund's", 1), ("channel's", 1), ("cartland's", 1), ("howard's", 1), ("bpa's", 1), ("wren's", 1), ("eamonn's", 1), ("daimler's", 1), ("juana's", 1), ("barrow's", 1), ("holly's", 1), ("sue's", 1), ("flavour's", 1), ("so's", 1), ("martin's", 1), ("hancock's", 1), ("smith's", 1), ("mankind's", 1), ("value's", 1), ("phone's", 1), ("eric's", 1), ("gillian's", 1), ("author's", 1), ("victoria's", 1), ("pamela's", 1), ("hour's", 1), ("grandfather's", 1), ("wheatley's", 1), ("jackie's", 1), ("malta's", 1), ("gormley's", 1), ("deer's", 1), ("rate's", 1), ("dunbar's", 1), ("anyone's", 1), ("sande's", 1), ("principle's", 1), ("gordon's", 1), ("julia's", 1), ("think's", 1), ("margaret's", 1), ("gabby's", 1), ("ronnie's", 1), ("baxter's", 1), ("canopy's", 1), ("bird's", 1), ("minton's", 1), ("alexandra's", 1), ("clerk's", 1), ("tb's", 1), ("chemist's", 1), ("fermi's", 1), ("jeanette's", 1), ("macmillan's", 1), ("drake's", 1), ("bottom's", 1), ("watkins's", 1), ("peterborough's", 1), ("linda's", 1), ("churchill's", 1), ("band's", 1), ("liverpool's", 1), ("bretby's", 1), ("auction's", 1), ("kitchener's", 1), ("blacksmith's", 1), ("constantine's", 1), ("justinian's", 1), ("orwell's", 1), ("roadshow's", 1), ("emperor's", 1), ("b's", 1), ("boudicca's", 1), ("part's", 1), ("alan's", 1), ("mortimer's", 1), ("commander's", 1), ("this'll", 1), ("daphne's", 1), ("chris's", 1), ("vicar's", 1), ("teddy's", 1), ("rome's", 1), ("devon's", 1), ("clayton's", 1), ("adam's", 1), ("nottingham's", 1), ("hollywood's", 1), ("andrew's", 1), ("denny's", 1), ("derby's", 1), ("that'd", 1), ("director's", 1), ("driver's", 1), ("ship's", 1), ("pop's", 1), ("sullivan's", 1), ("jamie's", 1), ("betty's", 1), ("dad'll", 1), ("lalique's", 1), ("laura's", 1), ("suzanne's", 1), ("jaguar's", 1), ("kat's", 1), ("kerr's", 1), ("tennyson's", 1), ("past's", 1), ("peacock's", 1), ("cow's", 1), ("parson's", 1), ("caroline's", 1), ("fire's", 1), ("friend's", 1), ("salesmen's", 1), ("darren's", 1), ("original's", 1), ("bernice's", 1), ("empire's", 1), ("marie's", 1), ("saul's", 1), ("canine's", 1), ("charlotte's", 1), ("farm's", 1), ("giant's", 1), ("damian's", 1), ("foxe's", 1), ("barbara's", 1), ("builder's", 1), ("edith's", 1), ("decision's", 1), ("ve'll", 1), ("hamish's", 1), ("tree's", 1), ("mcclintock's", 1), ("prince's", 1), ("cheque's", 1), ("australia's", 1), ("music's", 1), ("russell's", 1), ("hairdresser's", 1), ("lucy's", 1), ("cadbury's", 1), ("water's", 1), ("devil's", 1), ("venue's", 1), ("artist's", 1), ("beard's", 1), ("germany's", 1), ("juliet's", 1), ("player's", 1), ("torrin's", 1), ("hackman's", 1), ("photographer's", 1), ("madeira's", 1), ("monk's", 1), ("trinian's", 1), ("pont's", 1), ("tyler's", 1), ("love's", 1), ("naani's", 1), ("heston's", 1), ("mayor's", 1), ("scotland's", 1), ("chain's", 1), ("philip's", 1), ("tripper's", 1), ("len's", 1), ("building's", 1), ("byron's", 1), ("gear's", 1), ("limestone's", 1), ("mary's", 1), ("asprey's", 1), ("workmen's", 1), ("snake's", 1), ("washington's", 1), ("astley's", 1), ("smart's", 1), ("oakey's", 1), ("castle's", 1), ("miner's", 1), ("kent's", 1), ("story's", 1), ("mexico's", 1), ("collector's", 1), ("pm's", 1), ("fiction's", 1), ("ballard's", 1), ("wilson's", 1), ("gaulle's", 1), ("sony's", 1), ("korea's", 1), ("auctioneer's", 1), ("jessica's", 1), ("donkey's", 1), ("audrey's", 1), ("rodney's", 1), ("sharon's", 1), ("car's", 1), ("relative's", 1), ("france's", 1), ("bloke's", 1), ("catherine's", 1), ("merchant's", 1), ("kathleen's", 1), ("calm's", 1), ("rspb's", 1), ("viii's", 1), ("glitter's", 1), ("hartley's", 1), ("debbie's", 1), ("aim's", 1), ("grandma's", 1), ("heart's", 1), ("bertie's", 1), ("saddle's", 1), ("firm's", 1), ("machine's", 1), ("manor's", 1), ("ted's", 1), ("sunderland's", 1), ("cabot's", 1), ("tot's", 1), ("belfort's", 1), ("fisherman's", 1), ("half's", 1), ("season's", 1), ("frost's", 1), ("client's", 1), ("corvette's", 1), ("people've", 1), ("publisher's", 1), ("cameron's", 1), ("where'd", 1), ("adrian's", 1), ("julie's", 1), ("eve's", 1), ("clarkson's", 1), ("payer's", 1), ("hammer's", 1), ("hepburn's", 1), ("peck's", 1), ("evil's", 1), ("sandy's", 1), ("clare's", 1), ("barry's", 1), ("hitler's", 1), ("leg's", 1), ("spock's", 1), ("poppy's", 1), ("cinema's", 1), ("lord's", 1), ("morsi's", 1), ("incedal's", 1), ("now's", 1), ("generation's", 1), ("community's", 1), ("why've", 1), ("ben's", 1), ("photo's", 1), ("grainger's", 1), ("evening's", 1), ("couple's", 1), ("grace's", 1), ("store's", 1), ("brahms's", 1), ("fox's", 1), ("wellington's", 1), ("forum's", 1), ("property's", 1), ("bathroom's", 1), ("sunday's", 1), ("bill's", 1), ("crew's", 1), ("who'll", 1), ("teacher's", 1), ("justin's", 1), ("there've", 1), ("roman's", 1), ("dante's", 1), ("sailor's", 1), ("eva's", 1), ("monica's", 1), ("jade's", 1), ("mar's", 1), ("moorcroft's", 1), ("jay's", 1), ("military's", 1), ("hitchhiker's", 1), ("pilot's", 1), ("duxford's", 1), ("veteran's", 1), ("ireland's", 1), ("tea's", 1), ("graham's", 1), ("shazia's", 1), ("helen's", 1), ("bishop's", 1), ("beeching's", 1), ("might've", 1), ("jenny's", 1), ("jonathan's", 1), ("monday's", 1), ("control's", 1), ("adele's", 1), ("parkinson's", 1), ("stephen's", 1), ("savile's", 1), ("gilding's", 1), ("owen's", 1), ("professor's", 1), ("olympian's", 1), ("hodgkin's", 1), ("trump's", 1), ("eleanor's", 1), ("craig's", 1), ("alia's", 1), ("ram's", 1), ("college's", 1), ("harrison's", 1), ("pat's", 1), ("sister's", 1), ("practice's", 1), ("madonna's", 1), ("january's", 1), ("museum's", 1), ("madge's", 1), ("rene's", 1), ("reader's", 1), ("brian's", 1), ("flossy's", 1), ("countryfile's", 1), ("kevin's", 1), ("hubble's", 1), ("bang's", 1), ("alexander's", 1), ("aleksandr's", 1), ("moscow's", 1), ("harold's", 1), ("arctic's", 1), ("technology's", 1), ("patient's", 1), ("cbbc's", 1), ("charity's", 1), ("dude's", 1), ("janet's", 1), ("hand's", 1), ("dot's", 1), ("economy's", 1), ("william's", 1), ("sian's", 1), ("braxton's", 1), ("weston's", 1), ("tumour's", 1), ("gina's", 1), ("candidate's", 1), ("must've", 1), ("madeline's", 1), ("diamond's", 1), ("hammock's", 1), ("polo's", 1), ("humanity's", 1), ("maxwell's", 1), ("university's", 1), ("whoever's", 1), ("gregg's", 1), ("trotsky's", 1)], 12608)
In [ ]:
# Calculate word count for sentences
df['word_count'] = df['sentence'].apply(lambda x: len(x.split()))
# Print statistics on word counts
print(df['word_count'].describe())
# Visualization: Histograms of sentence word counts
plt.hist(df['word_count'], bins=20, alpha=0.7)
plt.title('Word Counts in Sentences')
plt.xlabel('Word Count')
plt.ylabel('Frequency')
plt.show()
count 45839.000000 mean 7.231702 std 3.770229 min 3.000000 25% 4.000000 50% 6.000000 75% 9.000000 max 28.000000 Name: word_count, dtype: float64
In [ ]:
# Create inflect engine once
p = inflect.engine()
def convert_numerical_ordinals_to_words(text):
words = text.split()
for i, word in enumerate(words):
# Removing punctuation for better matching
clean_word = word.rstrip(string.punctuation)
if match(r'\d+(st|nd|rd|th)', clean_word):
number = match(r'\d+', clean_word).group()
word_ordinal = p.number_to_words(int(number), ordinal=True, andword=' ', zero='zero', one='one')
# Retain the punctuation after conversion
punctuation = word[len(clean_word):]
word_ordinal += punctuation
words[i] = word_ordinal
return ' '.join(words)
In [ ]:
# Convert any numerical ordinals in the sentences to their word form
df['sentence'] = df['sentence'].apply(convert_numerical_ordinals_to_words)
# Display the first few rows to verify the changes
print(df.head())
sentence word_count 0 WHEN YOU'RE COOKING CHIPS AT HOME 6 1 THE TRADITIONAL CHIP PAN OFTEN STAYS ON THE SHELF 9 2 THROUGH WHAT THEY CALL A KNIFE BLOCK 7 3 WHICH INVOLVES FIRING A POTATO DOWN A PIPE 8 4 APART FROM THE GOLDEN COLOUR AND THE DELICIOUS... 9
In [ ]:
df_original = df.copy(deep=True)
In [ ]:
# Backup the sentences before conversion
df['original_sentence'] = df['sentence'].copy()
# Display a few randomly selected original and converted sentences for comparison
sample_sentences = df.sample(10)
for index, row in sample_sentences.iterrows():
print(f"Original: {row['original_sentence']}")
print(f"Converted: {row['sentence']}")
print("------")
# Compute statistics
df['word_count_after_conversion'] = df['sentence'].apply(lambda x: len(x.split()))
print("\nStatistics after conversion:")
print(df['word_count_after_conversion'].describe())
# Visualization: Histograms of sentence lengths after conversion
plt.hist(df['word_count_after_conversion'], bins=20, alpha=0.7, color='blue', label='After Conversion')
plt.hist(df['word_count'], bins=20, alpha=0.7, color='red', label='Before Conversion')
plt.title('Sentence Lengths Comparison')
plt.xlabel('Length (words)')
plt.ylabel('Frequency')
plt.legend()
plt.show()
Original: CAN WE FORGET ABOUT THE PRICE TAG Converted: CAN WE FORGET ABOUT THE PRICE TAG ------ Original: BUT IT IS A PLEASURE TO SIT THERE AND SEE WHAT TURNS UP Converted: BUT IT IS A PLEASURE TO SIT THERE AND SEE WHAT TURNS UP ------ Original: MY REAL NAME IS BASIL DEVERE COURTNEY Converted: MY REAL NAME IS BASIL DEVERE COURTNEY ------ Original: SO FOR EVERY one hundred Converted: SO FOR EVERY one hundred ------ Original: THEY'RE NOT SECOND HAND OR THIRD HAND Converted: THEY'RE NOT SECOND HAND OR THIRD HAND ------ Original: THERE IS A RARITY FACTOR Converted: THERE IS A RARITY FACTOR ------ Original: WHY DON'T WE HAVE A LOOK AT HOW POOR COLIN YOUNG IS GETTING ON WITH THE BLUE TEAM'S BONUS Converted: WHY DON'T WE HAVE A LOOK AT HOW POOR COLIN YOUNG IS GETTING ON WITH THE BLUE TEAM'S BONUS ------ Original: THE GAME GREW IN POPULARITY Converted: THE GAME GREW IN POPULARITY ------ Original: AS SOON AS THE DATE WAS ANNOUNCED Converted: AS SOON AS THE DATE WAS ANNOUNCED ------ Original: I'D SAY fifty TO eighty Converted: I'D SAY fifty TO eighty ------ Statistics after conversion: count 45839.000000 mean 7.231702 std 3.770229 min 3.000000 25% 4.000000 50% 6.000000 75% 9.000000 max 28.000000 Name: word_count_after_conversion, dtype: float64
In [ ]:
#Create a copy of the dataframe
df_copy = df.copy()
#Compare and create a 'changed' column
df['changed'] = df['sentence'] != df_copy['sentence']
# Obtain statistics
changed_count = df['changed'].sum()
unchanged_count = len(df) - changed_count
print(f"Number of sentences that changed: {changed_count}")
print(f"Number of sentences that remained unchanged: {unchanged_count}")
Number of sentences that changed: 0 Number of sentences that remained unchanged: 45839
In [ ]:
# List to store words that were converted
converted_words_list = []
# Iterate through each row of the dataframe
for index, row in df.iterrows():
original_words = df_copy.loc[index, 'sentence'].split()
converted_words = row['sentence'].split()
for orig, conv in zip(original_words, converted_words):
if orig != conv:
converted_words_list.append((orig, conv))
# Count the occurrence of each conversion
conversion_counter = Counter(converted_words_list)
# Display the most common conversions
common_conversions = conversion_counter.most_common()
print("Most common word conversions:")
for conversion, count in common_conversions:
orig, conv = conversion
print(f"{orig} -> {conv}: {count} times")
Most common word conversions:
In [ ]:
# copy df_expanded to df_before_token
df_before_token = df.copy()
In [ ]:
print(df.head())
sentence word_count \ 0 WHEN YOU'RE COOKING CHIPS AT HOME 6 1 THE TRADITIONAL CHIP PAN OFTEN STAYS ON THE SHELF 9 2 THROUGH WHAT THEY CALL A KNIFE BLOCK 7 3 WHICH INVOLVES FIRING A POTATO DOWN A PIPE 8 4 APART FROM THE GOLDEN COLOUR AND THE DELICIOUS... 9 original_sentence \ 0 WHEN YOU'RE COOKING CHIPS AT HOME 1 THE TRADITIONAL CHIP PAN OFTEN STAYS ON THE SHELF 2 THROUGH WHAT THEY CALL A KNIFE BLOCK 3 WHICH INVOLVES FIRING A POTATO DOWN A PIPE 4 APART FROM THE GOLDEN COLOUR AND THE DELICIOUS... word_count_after_conversion changed 0 6 False 1 9 False 2 7 False 3 8 False 4 9 False
In [ ]:
df['sentence'] = df['sentence'].str.lower()
df.head()
Out[ ]:
sentence | word_count | original_sentence | word_count_after_conversion | changed | |
---|---|---|---|---|---|
0 | when you're cooking chips at home | 6 | WHEN YOU'RE COOKING CHIPS AT HOME | 6 | False |
1 | the traditional chip pan often stays on the shelf | 9 | THE TRADITIONAL CHIP PAN OFTEN STAYS ON THE SHELF | 9 | False |
2 | through what they call a knife block | 7 | THROUGH WHAT THEY CALL A KNIFE BLOCK | 7 | False |
3 | which involves firing a potato down a pipe | 8 | WHICH INVOLVES FIRING A POTATO DOWN A PIPE | 8 | False |
4 | apart from the golden colour and the delicious... | 9 | APART FROM THE GOLDEN COLOUR AND THE DELICIOUS... | 9 | False |
In [ ]:
df.head()
Out[ ]:
sentence | word_count | original_sentence | word_count_after_conversion | changed | |
---|---|---|---|---|---|
0 | when you're cooking chips at home | 6 | WHEN YOU'RE COOKING CHIPS AT HOME | 6 | False |
1 | the traditional chip pan often stays on the shelf | 9 | THE TRADITIONAL CHIP PAN OFTEN STAYS ON THE SHELF | 9 | False |
2 | through what they call a knife block | 7 | THROUGH WHAT THEY CALL A KNIFE BLOCK | 7 | False |
3 | which involves firing a potato down a pipe | 8 | WHICH INVOLVES FIRING A POTATO DOWN A PIPE | 8 | False |
4 | apart from the golden colour and the delicious... | 9 | APART FROM THE GOLDEN COLOUR AND THE DELICIOUS... | 9 | False |
In [ ]:
# Load the CMU Pronunciation Dictionary
pronunciation_dict = cmudict.dict()
# Initialize the g2p converter
g2p = G2p()
def tokenize_and_lowercase_text(text):
"""Tokenize and lowercase text."""
# Replace newline characters with space
text = text.replace('\n', ' ')
# Expand contractions
text = contractions.fix(text)
# Handle decades
text = re.sub(r'(\d+)(s)', r'\1 \2', text)
# Tokenize text
tokens = nltk.word_tokenize(text)
# Lowercase tokens
tokens = [token.lower() for token in tokens]
return tokens
def words_to_phonemes(words):
phonemes = []
for word in words:
if word in ['.', ',', '?', '!', ':', ';']:
phonemes.append('<space>')
else:
if word in pronunciation_dict:
phonemes.extend(pronunciation_dict[word][0])
phonemes.append('<space>')
elif word == "'":
pass
else:
phonemes.extend(g2p(word))
phonemes.append('<space>')
return phonemes
def process_sentence(sentence):
try:
# Tokenize and lowercase text
tokenized_sentence = tokenize_and_lowercase_text(sentence)
# Convert words to phonemes
phonemes = words_to_phonemes(tokenized_sentence)
phonemes = ['<sos>'] + phonemes[:-1] + ['<eos>']
return phonemes
except Exception as e:
print(f"Error processing sentence: {sentence}")
print(e)
return None
def expand_contractions(text):
"""Expand contractions in a text."""
return contractions.fix(text)
# Expand contractions in the sentence column
df['sentence'] = df['sentence'].apply(expand_contractions)
# Then apply the tokenization and phoneme conversion processes as before
with Pool() as pool:
df['phonemes'] = pool.map(process_sentence, df['sentence'])
print(df.head())
# Inspect the data
# Check the sentences where the <space> token is not present or is present less frequently than expected
df['word_count'] = df['sentence'].apply(lambda x: len(x.split()))
df['num_spaces'] = df['phonemes'].apply(lambda x: x.count('<space>'))
unusual_sentences = df[df['num_spaces'] < df['word_count'] - 1]
print(unusual_sentences)
sentence word_count \ 0 when you are cooking chips at home 6 1 the traditional chip pan often stays on the shelf 9 2 through what they call a knife block 7 3 which involves firing a potato down a pipe 8 4 apart from the golden colour and the delicious... 9 original_sentence \ 0 WHEN YOU'RE COOKING CHIPS AT HOME 1 THE TRADITIONAL CHIP PAN OFTEN STAYS ON THE SHELF 2 THROUGH WHAT THEY CALL A KNIFE BLOCK 3 WHICH INVOLVES FIRING A POTATO DOWN A PIPE 4 APART FROM THE GOLDEN COLOUR AND THE DELICIOUS... word_count_after_conversion changed \ 0 6 False 1 9 False 2 7 False 3 8 False 4 9 False phonemes 0 [<sos>, W, EH1, N, <space>, Y, UW1, <space>, A... 1 [<sos>, DH, AH0, <space>, T, R, AH0, D, IH1, S... 2 [<sos>, TH, R, UW1, <space>, W, AH1, T, <space... 3 [<sos>, W, IH1, CH, <space>, IH0, N, V, AA1, L... 4 [<sos>, AH0, P, AA1, R, T, <space>, F, R, AH1,... Empty DataFrame Columns: [sentence, word_count, original_sentence, word_count_after_conversion, changed, phonemes, num_spaces] Index: []
In [ ]:
# Sample 10 random sentences from the dataset
sample_sentences = df['sentence'].sample(10)
token_counts = [len(tokenize_and_lowercase_text(sentence)) for sentence in sample_sentences]
sentence_counts = [len(sentence.split()) for sentence in sample_sentences]
# Bar Chart
index = range(len(sample_sentences))
bar_width = 0.35
fig, ax = plt.subplots(figsize=(12, 6))
bar1 = ax.bar(index, sentence_counts, bar_width, label='Original Word Count', color='#3498DB', edgecolor='black')
bar2 = ax.bar([i + bar_width for i in index], token_counts, bar_width, label='Tokenized Word Count', color='#E74C3C', edgecolor='black')
ax.set_xlabel('Sentences')
ax.set_ylabel('Word Count')
ax.set_title('Comparison of Word Counts Before and After Tokenization')
ax.set_xticks([i + bar_width for i in index])
ax.set_xticklabels(['Sentence ' + str(i+1) for i in index], rotation=45)
ax.legend()
plt.tight_layout()
plt.show()
# Annotated Text Display
for index, sentence in enumerate(sample_sentences[:2]):
tokens = tokenize_and_lowercase_text(sentence)
print(f"Sentences {index+1}:")
print(f"Original: {sentence}")
print(f"Tokenized: {tokens}")
print("-"*100)
Sentences 1: Original: that is always fascinated me Tokenized: ['that', 'is', 'always', 'fascinated', 'me'] ---------------------------------------------------------------------------------------------------- Sentences 2: Original: which means the light comes from hot Tokenized: ['which', 'means', 'the', 'light', 'comes', 'from', 'hot'] ----------------------------------------------------------------------------------------------------
In [ ]:
# Convert the list of phonemes in the 'phonemes' column to a space-separated string
df['phonemes_str'] = df['phonemes'].str.join(' ')
# Create a function to display the dataframe without truncation
def display_full_dataframe(dataframe):
with pd.option_context('display.max_rows', None, 'display.max_columns', None, 'display.max_colwidth', None):
display(dataframe)
# Use the function to display the first 5 rows
display_full_dataframe(df[["phonemes_str"]].head())
phonemes_str | |
---|---|
0 | <sos> W EH1 N <space> Y UW1 <space> AA1 R <space> K UH1 K IH0 NG <space> CH IH1 P S <space> AE1 T <space> HH OW1 M <eos> |
1 | <sos> DH AH0 <space> T R AH0 D IH1 SH AH0 N AH0 L <space> CH IH1 P <space> P AE1 N <space> AO1 F AH0 N <space> S T EY1 Z <space> AA1 N <space> DH AH0 <space> SH EH1 L F <eos> |
2 | <sos> TH R UW1 <space> W AH1 T <space> DH EY1 <space> K AO1 L <space> AH0 <space> N AY1 F <space> B L AA1 K <eos> |
3 | <sos> W IH1 CH <space> IH0 N V AA1 L V Z <space> F AY1 R IH0 NG <space> AH0 <space> P AH0 T EY1 T OW2 <space> D AW1 N <space> AH0 <space> P AY1 P <eos> |
4 | <sos> AH0 P AA1 R T <space> F R AH1 M <space> DH AH0 <space> G OW1 L D AH0 N <space> K AH1 L AW0 R <space> AH0 N D <space> DH AH0 <space> D IH0 L IH1 SH AH0 S <space> F L AE1 V ER0 <eos> |
In [ ]:
df.head()
Out[ ]:
sentence | word_count | original_sentence | word_count_after_conversion | changed | phonemes | num_spaces | phonemes_str | |
---|---|---|---|---|---|---|---|---|
0 | when you are cooking chips at home | 7 | WHEN YOU'RE COOKING CHIPS AT HOME | 6 | False | [<sos>, W, EH1, N, <space>, Y, UW1, <space>, A... | 6 | <sos> W EH1 N <space> Y UW1 <space> AA1 R <spa... |
1 | the traditional chip pan often stays on the shelf | 9 | THE TRADITIONAL CHIP PAN OFTEN STAYS ON THE SHELF | 9 | False | [<sos>, DH, AH0, <space>, T, R, AH0, D, IH1, S... | 8 | <sos> DH AH0 <space> T R AH0 D IH1 SH AH0 N AH... |
2 | through what they call a knife block | 7 | THROUGH WHAT THEY CALL A KNIFE BLOCK | 7 | False | [<sos>, TH, R, UW1, <space>, W, AH1, T, <space... | 6 | <sos> TH R UW1 <space> W AH1 T <space> DH EY1 ... |
3 | which involves firing a potato down a pipe | 8 | WHICH INVOLVES FIRING A POTATO DOWN A PIPE | 8 | False | [<sos>, W, IH1, CH, <space>, IH0, N, V, AA1, L... | 7 | <sos> W IH1 CH <space> IH0 N V AA1 L V Z <spac... |
4 | apart from the golden colour and the delicious... | 9 | APART FROM THE GOLDEN COLOUR AND THE DELICIOUS... | 9 | False | [<sos>, AH0, P, AA1, R, T, <space>, F, R, AH1,... | 8 | <sos> AH0 P AA1 R T <space> F R AH1 M <space> ... |
In [ ]:
# Remove sentences that contain ' or space in the phonemes
df = df[~df['phonemes'].apply(lambda x: "'" in x or ' ' in x)]
In [ ]:
# Create a dictionary for removing stress markers
remove_stress_dict = {str(i): '' for i in range(10)}
def remove_stress(phonemes):
"""Remove stress markers from a list of phonemes."""
return [re.sub(r'\d', '', phoneme) for phoneme in phonemes]
def add_special_tokens(sentence):
"""Add special tokens to a sentence."""
return '<sos> ' + sentence.replace(' ', ' <space> ') + ' <eos>'
# Apply the function to the sentence column
df['sentence_with_tokens'] = df['sentence'].apply(add_special_tokens)
print(df[['sentence', 'sentence_with_tokens', 'phonemes']].sample(10))
# Apply the processing function
df['phonemes'] = df['phonemes'].apply(remove_stress)
# Sample Inspection
print(df[['sentence', 'phonemes']].sample(10))
# Distribution Analysis
df['phoneme_count'] = df['phonemes'].str.len()
print(df['phoneme_count'].describe())
# Special Tokens Check
wrong_start = df[df['phonemes'].str[0] != "<sos>"]
wrong_end = df[df['phonemes'].str[-1] != "<eos>"]
print(f"Number of sequences with wrong start: {len(wrong_start)}")
print(f"Number of sequences with wrong end: {len(wrong_end)}")
# Check for None values
none_sentences = df[df['phonemes'].apply(lambda x: None in x)]
print(f"Number of sentences with None values: {len(none_sentences)}")
# Frequency Analysis
all_phonemes = list(chain.from_iterable(df['phonemes']))
phoneme_freq = Counter(all_phonemes)
print("Most common phonemes:", phoneme_freq.most_common(10))
print("Least common phonemes:", phoneme_freq.most_common()[:-11:-1])
# Check if there are any missing phonemes
missing_phonemes = df[df['phonemes'].apply(lambda x: None in x)]
print(f"Number of sentences with missing phonemes: {len(missing_phonemes)}")
space_sentences = df[df['phonemes'].apply(lambda x: ' ' in x)]
print(space_sentences[['sentence', 'phonemes']])
<ipython-input-50-a357df83680a>:13: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy df['sentence_with_tokens'] = df['sentence'].apply(add_special_tokens)
sentence \ 12522 tried to break it out of my hand 12508 do not you call a doctor 16858 we have got what we wanted 18655 two of our best teams will be competing agains... 1100 if you had clarissa beside you on the barricades 43334 you brought it in 40657 the dead do not come back 11112 by the famous dambusters 25708 if you clear your plate of sandwiches they kee... 17329 how are you today sentence_with_tokens \ 12522 <sos> tried <space> to <space> break <space> i... 12508 <sos> do <space> not <space> you <space> call ... 16858 <sos> we <space> have <space> got <space> what... 18655 <sos> two <space> of <space> our <space> best ... 1100 <sos> if <space> you <space> had <space> clari... 43334 <sos> you <space> brought <space> it <space> i... 40657 <sos> the <space> dead <space> do <space> not ... 11112 <sos> by <space> the <space> famous <space> da... 25708 <sos> if <space> you <space> clear <space> you... 17329 <sos> how <space> are <space> you <space> toda... phonemes 12522 [<sos>, T, R, AY1, D, <space>, T, UW1, <space>... 12508 [<sos>, D, UW1, <space>, N, AA1, T, <space>, Y... 16858 [<sos>, W, IY1, <space>, HH, AE1, V, <space>, ... 18655 [<sos>, T, UW1, <space>, AH1, V, <space>, AW1,... 1100 [<sos>, IH1, F, <space>, Y, UW1, <space>, HH, ... 43334 [<sos>, Y, UW1, <space>, B, R, AO1, T, <space>... 40657 [<sos>, DH, AH0, <space>, D, EH1, D, <space>, ... 11112 [<sos>, B, AY1, <space>, DH, AH0, <space>, F, ... 25708 [<sos>, IH1, F, <space>, Y, UW1, <space>, K, L... 17329 [<sos>, HH, AW1, <space>, AA1, R, <space>, Y, ... sentence \ 21948 when you can actually hold the fabric that the... 4958 there is something about them which makes us f... 18610 bills of sale and receipts 43467 look out for the qr codes 30773 most influential figures in british comedy 7102 i cannot wait to see it and you can find out h... 14877 we are pinning all our hopes on the man with t... 5095 there is lots of smaller 25014 why do not you do the power test 38100 on that assumption phonemes 21948 [<sos>, W, EH, N, <space>, Y, UW, <space>, K, ... 4958 [<sos>, DH, EH, R, <space>, IH, Z, <space>, S,... 18610 [<sos>, B, IH, L, Z, <space>, AH, V, <space>, ... 43467 [<sos>, L, UH, K, <space>, AW, T, <space>, F, ... 30773 [<sos>, M, OW, S, T, <space>, IH, N, F, L, UW,... 7102 [<sos>, AY, <space>, K, AE, N, <space>, N, AA,... 14877 [<sos>, W, IY, <space>, AA, R, <space>, P, IH,... 5095 [<sos>, DH, EH, R, <space>, IH, Z, <space>, L,... 25014 [<sos>, W, AY, <space>, D, UW, <space>, N, AA,... 38100 [<sos>, AA, N, <space>, DH, AE, T, <space>, AH... count 45814.000000 mean 34.139040 std 17.523979 min 11.000000 25% 21.000000 50% 29.000000 75% 42.000000 max 141.000000 Name: phoneme_count, dtype: float64 Number of sequences with wrong start: 0 Number of sequences with wrong end: 0 Number of sentences with None values: 0 Most common phonemes: [('<space>', 299529), ('AH', 111029), ('T', 91599), ('N', 77726), ('IH', 75183), ('R', 52083), ('S', 50329), ('D', 47510), ('<sos>', 45814), ('<eos>', 45814)] Least common phonemes: [('ZH', 444), ('OY', 1151), ('UH', 5864), ('JH', 6134), ('CH', 6196), ('TH', 6864), ('SH', 7392), ('AW', 8615), ('Y', 11279), ('G', 11628)] Number of sentences with missing phonemes: 0 Empty DataFrame Columns: [sentence, phonemes] Index: []
In [ ]:
# Add tokens to sentences for comparison
def add_tokens_to_sentence(sentence):
return '<sos> ' + sentence.replace(' ', ' <space> ') + ' <eos>'
df['tokenized_sentence'] = df['sentence'].apply(add_tokens_to_sentence)
# Convert the list of phonemes to a space-separated string for display
df['phonemes_str'] = df['phonemes'].apply(lambda x: ' '.join(x))
# Display the tokenized sentences and their corresponding phonemes
sample_comparison = df[['tokenized_sentence', 'phonemes_str']].sample(5)
with pd.option_context('display.max_rows', None, 'display.max_colwidth', None):
display(sample_comparison)
tokenized_sentence | phonemes_str | |
---|---|---|
17720 | <sos> but <space> what <space> it <space> does <space> need <space> is <space> a <space> set <space> of <space> eyes <eos> | <sos> B AH T <space> W AH T <space> IH T <space> D AH Z <space> N IY D <space> IH Z <space> AH <space> S EH T <space> AH V <space> AY Z <eos> |
26979 | <sos> taken <space> very <space> seriously <eos> | <sos> T EY K AH N <space> V EH R IY <space> S IH R IY AH S L IY <eos> |
39564 | <sos> let <space> us <space> find <space> out <space> about <space> one <space> of <space> the <space> most <space> ancient <space> plants <space> on <space> the <space> planet <eos> | <sos> L EH T <space> AH S <space> F AY N D <space> AW T <space> AH B AW T <space> W AH N <space> AH V <space> DH AH <space> M OW S T <space> EY N CH AH N T <space> P L AE N T S <space> AA N <space> DH AH <space> P L AE N AH T <eos> |
32297 | <sos> he <space> might <space> also <space> have <space> been <space> quietly <space> beheaded <eos> | <sos> HH IY <space> M AY T <space> AO L S OW <space> HH AE V <space> B IH N <space> K W AY AH T L IY <space> B IH HH EH D IH D <eos> |
27232 | <sos> within <space> twenty <space> four <space> hours <eos> | <sos> W IH DH IH N <space> T W EH N T IY <space> F AO R <space> AW ER Z <eos> |
In [ ]:
def check_consecutive_special_tokens(sentence_sequences, phoneme_sequences):
special_tokens = ['<eos>', '<sos>', '<space>']
for seq in sentence_sequences:
for token in special_tokens:
if f"{token} {token}" in seq:
print(f"Consecutive {token} found in sentence: {seq}")
for seq in phoneme_sequences:
for token in special_tokens:
if f"{token} {token}" in ' '.join(seq):
print(f"Consecutive {token} found in phoneme: {' '.join(seq)}")
# Example usage:
check_consecutive_special_tokens(df['sentence_with_tokens'], df['phonemes'])
Consecutive <space> found in phoneme: <sos> DH AH <space> AA R CH ER Z <space> L AO S T <space> IH T S <space> EH JH AH K EY SH AH N AH L <space> P ER P AH S <space> IH N <space> W AH N <space> TH AW Z AH N D <space> <space> N AY N <space> HH AH N D R AH D <space> AH N D <space> S EH V AH N T IY <space> T UW <eos> Consecutive <space> found in phoneme: <sos> W IY <space> AA R <space> K AA N S AH N T R EY T IH NG <space> AA N <space> W AH N <space> TH AW Z AH N D <space> <space> N AY N <space> HH AH N D R AH D <space> AH N D <space> S IH K S T IY <space> F AO R <space> AH N D <space> W AH N <space> TH AW Z AH N D <space> <space> N AY N <space> HH AH N D R AH D <space> AH N D <space> S IH K S T IY <space> F AY V <space> AH N D <space> HH IY R IH NG <space> S AH M <space> M AO R <space> P ER S IH N IH L <space> S T AO R IY Z <space> AH V <eos> Consecutive <space> found in phoneme: <sos> L EH T <space> AH S <space> T EY K <space> AH <space> L UH K <space> AE T <space> S AH M <space> AH V <space> DH AH <space> AH DH ER <space> N UW Z <space> HH EH D L AY N Z <space> IH N <space> W AH N <space> TH AW Z AH N D <space> <space> N AY N <space> HH AH N D R AH D <space> AH N D <space> S IH K S T IY <space> S IH K S <space> AH N D <space> S IH K S T IY <space> S EH V AH N <space> AH L AO NG <space> W IH DH <space> S AH M <space> AH V <space> DH AH <space> M Y UW Z IH K <eos> Consecutive <space> found in phoneme: <sos> F OW K AH S IH NG <space> AA N <space> DH AH <space> IH V EH N T S <space> IH N <space> W AH N <space> TH AW Z AH N D <space> <space> N AY N <space> HH AH N D R AH D <space> AH N D <space> S IH K S T IY <space> EY T <space> AH N D <space> W AH N <space> TH AW Z AH N D <space> <space> N AY N <space> HH AH N D R AH D <space> AH N D <space> S IH K S T IY <space> N AY N <eos> Consecutive <space> found in phoneme: <sos> HH UW <space> D AY D <space> IH N <space> W AH N <space> TH AW Z AH N D <space> <space> N AY N <space> HH AH N D R AH D <space> AH N D <space> F AO R T IY <eos> Consecutive <space> found in phoneme: <sos> B IY IH NG <space> R AH S IY V D <space> IH N <space> W AH N <space> TH AW Z AH N D <space> <space> S EH V AH N <space> HH AH N D R AH D <space> AH N D <space> TH ER D IY <space> F AO R <space> AH N D <space> M AE N Y AH F AE K CH ER D <space> DH AE T <space> Y IH R <eos> Consecutive <space> found in phoneme: <sos> IH T <space> IH Z <space> B IH N <space> P R AH D UW S IH NG <space> L OW K AH L <space> EY L <space> S IH N S <space> AE T <space> L IY S T <space> W AH N <space> TH AW Z AH N D <space> <space> S IH K S <space> HH AH N D R AH D <space> AH N D <space> F AO R T IY <space> T UW <eos> Consecutive <space> found in phoneme: <sos> B AH T <space> IH N <space> F AE K T <space> IH N <space> W AH N <space> TH AW Z AH N D <space> <space> F AY V <space> HH AH N D R AH D <space> AH N D <space> T W EH N T IY <space> S EH V AH N <eos> Consecutive <space> found in phoneme: <sos> T UW <space> DH AH <space> T AY M <space> AH V <space> HH IH Z <space> D EH TH <space> IH N <space> W AH N <space> TH AW Z AH N D <space> <space> N AY N <space> HH AH N D R AH D <eos> Consecutive <space> found in phoneme: <sos> AY <space> HH AE V <space> B IH N <space> AH <space> V EH JH AH T EH R IY AH N <space> S IH N S <space> W AH N <space> TH AW Z AH N D <space> <space> N AY N <space> HH AH N D R AH D <space> AH N D <space> S EH V AH N T IY <space> N AY N <eos> Consecutive <space> found in phoneme: <sos> AH <space> V AH N IY SH AH N <space> HH UW <space> K EY M <space> HH IY R <space> B IH T W IY N <space> AH B AW T <space> W AH N <space> TH AW Z AH N D <space> <space> S EH V AH N <space> HH AH N D R AH D <space> AH N D <space> N AY N <space> AH N D <space> W AH N <space> TH AW Z AH N D <space> <space> S EH V AH N <space> HH AH N D R AH D <space> AH N D <space> T EH N <eos> Consecutive <space> found in phoneme: <sos> DH AH <space> AA R CH ER Z <space> F R AH M <space> W AH N <space> TH AW Z AH N D <space> <space> N AY N <space> HH AH N D R AH D <space> AH N D <space> EY T IY <space> T UW <space> DH AH <space> P R EH Z AH N T <space> D EY <eos> Consecutive <space> found in phoneme: <sos> AW ER <space> F ER S T <space> W AA Z <space> CH OW Z AH N <space> F AO R <space> HH ER <space> M AE JH AH S T IY <space> EH S <space> W EH D IH NG <space> T UW <space> P R IH N S <space> F IH L AH P <space> B AE K <space> IH N <space> W AH N <space> TH AW Z AH N D <space> <space> N AY N <space> HH AH N D R AH D <space> AH N D <space> F AO R T IY <space> S EH V AH N <eos> Consecutive <space> found in phoneme: <sos> AH N D <space> B AY <space> DH AH <space> T AY M <space> AH V <space> HH IH Z <space> D EH TH <space> IH N <space> W AH N <space> TH AW Z AH N D <space> <space> EY T <space> HH AH N D R AH D <space> AH N D <space> TH ER D IY <space> F AY V <eos> Consecutive <space> found in phoneme: <sos> IH T <space> W AA Z <space> M EY D <space> IH N <space> AH B AW T <space> W AH N <space> TH AW Z AH N D <space> <space> N AY N <space> HH AH N D R AH D <space> AH N D <space> TH ER D IY <space> F AY V <eos> Consecutive <space> found in phoneme: <sos> W EH N <space> DH AH <space> R EY L W EY <space> S T EY SH AH N <space> W AA Z <space> IH N AO G ER EY T IH D <space> HH IY R <space> IH N <space> AA G AH S T <space> W AH N <space> TH AW Z AH N D <space> <space> EY T <space> HH AH N D R AH D <space> AH N D <space> F IH F T IY <space> W AH N <eos> Consecutive <space> found in phoneme: <sos> P R IY S IH ZH AH N <space> S T R AY K <space> W AA Z <space> W AH T <space> DH AH <space> D AE M B AH S T ER Z <space> W ER <space> AH B AW T <space> IH N <space> W AH N <space> TH AW Z AH N D <space> <space> N AY N <space> HH AH N D R AH D <space> AH N D <space> F AO R T IY <space> TH R IY <space> AH N D <space> DH AE T <space> IH Z <space> V EH R IY <space> M AH CH <space> DH AH <space> S EY M <space> T AH D EY <eos> Consecutive <space> found in phoneme: <sos> W EH N <space> HH EH N R IY <space> N EH L T <space> T UW <space> M EY K <space> HH IH Z <space> W EH D IH NG <space> V AW Z <space> T UW <space> AE N <space> B OW L IH N <space> IH N <space> JH AE N Y UW EH R IY <space> W AH N <space> TH AW Z AH N D <space> <space> F AY V <space> HH AH N D R AH D <space> AH N D <space> TH ER D IY <space> TH R IY <eos> Consecutive <space> found in phoneme: <sos> IH N <space> AA K T OW B ER <space> W AH N <space> TH AW Z AH N D <space> <space> F AY V <space> HH AH N D R AH D <space> AH N D <space> TH ER D IY <space> S EH V AH N <eos> Consecutive <space> found in phoneme: <sos> B AH T <space> AY <space> TH IH NG K <space> IH T <space> IH Z <space> AH <space> W AH N D ER F AH L <space> IY V OW K EY SH AH N <space> AH V <space> DH AH <space> HH AA R D <space> T AY M Z <space> IH N <space> W AH N <space> TH AW Z AH N D <space> <space> N AY N <space> HH AH N D R AH D <space> AH N D <space> F AO R T IY <eos> Consecutive <space> found in phoneme: <sos> DH AE T <space> HH AE D <space> AH <space> AH K <space> T AA P <space> T EH N <space> S IH NG G AH L <space> IH N <space> W AH N <space> TH AW Z AH N D <space> <space> N AY N <space> HH AH N D R AH D <space> AH N D <space> N AY N T IY <space> F AY V <space> W IH DH <space> T ER N <space> AA N <eos> Consecutive <space> found in phoneme: <sos> F ER S T <space> P ER F AO R M D <space> IH N <space> W AH N <space> TH AW Z AH N D <space> <space> EY T <space> HH AH N D R AH D <space> AH N D <space> F AO R T IY <space> S IH K S <eos> Consecutive <space> found in phoneme: <sos> HH IY R <space> IH Z <space> AH <space> R IH L IY <space> G UH D <space> W EY <space> T UW <space> D IY L <space> W IH DH <space> DH EH M <space> IH N <space> DH AH <space> W AH N <space> TH AW Z AH N D <space> <space> N AY N <space> HH AH N D R AH D <space> AH N D <space> EY T IY <space> F AO R <space> G AA R D AH N <eos> Consecutive <space> found in phoneme: <sos> W IH CH <space> AY <space> AE M <space> G EH S IH NG <space> IH Z <space> S AH M TH IH NG <space> B IH T W IY N <space> W AH N <space> TH AW Z AH N D <space> <space> EY T <space> HH AH N D R AH D <space> AH N D <space> T W EH N T IY <eos> Consecutive <space> found in phoneme: <sos> HH IY <space> D AY D <space> IH N <space> W AH N <space> TH AW Z AH N D <space> <space> S EH V AH N <space> HH AH N D R AH D <space> AH N D <space> F IH F T IY <space> F AO R <eos> Consecutive <space> found in phoneme: <sos> DH EY <space> M EH R IY D <space> IH N <space> W AH N <space> TH AW Z AH N D <space> <space> EY T <space> HH AH N D R AH D <space> AH N D <space> N AY N T IY <space> TH R IY <eos> Consecutive <space> found in phoneme: <sos> AH N D <space> HH IY <space> R IH T AY R D <space> IH N <space> AH B AW T <space> W AH N <space> TH AW Z AH N D <space> <space> N AY N <space> HH AH N D R AH D <space> AH N D <space> N AY N T IY <space> W AH N <eos> Consecutive <space> found in phoneme: <sos> M AH Z UH R IY <space> IH N <space> W AH N <space> TH AW Z AH N D <space> <space> N AY N <space> HH AH N D R AH D <space> AH N D <space> F AO R T IY <space> S IH K S <eos> Consecutive <space> found in phoneme: <sos> G AY <space> G AA T <space> HH IH Z <space> N EY M <space> W EH N <space> HH IY <space> ER AY V D <space> AE T <space> DH AH <space> Z UW <space> AA N <space> G AY <space> F AO K S <space> D EY <space> IH N <space> W AH N <space> TH AW Z AH N D <space> <space> N AY N <space> HH AH N D R AH D <space> AH N D <space> F AO R T IY <space> S EH V AH N <eos> Consecutive <space> found in phoneme: <sos> K IH L ER T AH N <space> HH AW S <space> W AA Z <space> B IH L T <space> IH N <space> W AH N <space> TH AW Z AH N D <space> <space> S EH V AH N <space> HH AH N D R AH D <space> AH N D <space> S EH V AH N T IY <space> N AY N <space> F AO R <space> W AH N <space> AH V <space> D EH V AH N <space> EH S <space> OW L D AH S T <space> F AE M AH L IY Z <eos> Consecutive <space> found in phoneme: <sos> K AH M IH NG <space> AH P <space> T UW <space> W AH N <space> TH AW Z AH N D <space> <space> EY T <space> HH AH N D R AH D <space> AH N D <space> AH <space> B IH T <space> B IH AA N D <eos> Consecutive <space> found in phoneme: <sos> IH N <space> ER AW N D <space> AH B AW T <space> DH AH <space> Y IH R <space> W AH N <space> TH AW Z AH N D <space> <space> N AY N <space> HH AH N D R AH D <space> AH N D <space> T EH N <eos> Consecutive <space> found in phoneme: <sos> AH N D <space> HH IY <space> W AA Z <space> B AO R N <space> IH N <space> W AH N <space> TH AW Z AH N D <space> <space> EY T <space> HH AH N D R AH D <space> AH N D <space> EY T IY <space> EY T <eos> Consecutive <space> found in phoneme: <sos> IH N <space> DH AH <space> W IH N T ER <space> AH V <space> W AH N <space> TH AW Z AH N D <space> <space> N AY N <space> HH AH N D R AH D <space> AH N D <space> F IH F T IY <space> EY T <eos> Consecutive <space> found in phoneme: <sos> W IH CH <space> IH T <space> D IH D <space> AH N T IH L <space> W AH N <space> TH AW Z AH N D <space> <space> EY T <space> HH AH N D R AH D <space> AH N D <space> N AY N T IY <space> N AY N <eos> Consecutive <space> found in phoneme: <sos> DH AH <space> L UW S AH T EY N IY AH <space> W AH N <space> DH AH <space> B L UW <space> R AY B AH N D <space> IH N <space> W AH N <space> TH AW Z AH N D <space> <space> N AY N <space> HH AH N D R AH D <space> AH N D <space> S EH V AH N <eos> Consecutive <space> found in phoneme: <sos> P AH B L IH SH T <space> IH N <space> W AH N <space> TH AW Z AH N D <space> <space> N AY N <space> HH AH N D R AH D <space> AH N D <space> W AH N <space> AH N D <space> K AO L D <space> S IH M P L IY <space> P AA V ER T IY <eos> Consecutive <space> found in phoneme: <sos> AY <space> HH AE V <space> B IH N <space> AE N <space> AE K T R AH S <space> S IH N S <space> W AH N <space> TH AW Z AH N D <space> <space> N AY N <space> HH AH N D R AH D <space> AH N D <space> EY T IY <space> F AY V <eos> Consecutive <space> found in phoneme: <sos> IH T <space> W AA Z <space> K AE S T <space> IH N <space> L AO B ER OW <space> IH N <space> W AH N <space> TH AW Z AH N D <space> <space> N AY N <space> HH AH N D R AH D <space> AH N D <space> T W EH N T IY <space> F AO R <eos> Consecutive <space> found in phoneme: <sos> DH EH R <space> W AA Z <space> L EH S <space> F AO R AH S T <space> IH N <space> W AH N <space> TH AW Z AH N D <space> <space> N AY N <space> HH AH N D R AH D <space> DH AE N <space> DH EH R <space> HH AE D <space> B IH N <space> F AO R <space> T EH N <eos> Consecutive <space> found in phoneme: <sos> IH T <space> HH AE Z <space> B IH N <space> L EY D <space> AH P <space> S IH N S <space> IH T <space> B R OW K <space> D AW N <space> IH N <space> W AH N <space> TH AW Z AH N D <space> <space> N AY N <space> HH AH N D R AH D <space> AH N D <space> TH ER D IY <space> F AO R <eos> Consecutive <space> found in phoneme: <sos> AY <space> W AA Z <space> L AH K IY <space> IH N AH F <space> T UW <space> S IH T <space> W IH DH <space> AH <space> G AY <space> HH UW <space> W AH N <space> DH AH <space> F ER S T <space> B R IH T IH SH <space> R AE L IY <space> CH AE M P IY AH N SH IH P <space> IH N <space> W AH N <space> TH AW Z AH N D <space> <space> N AY N <space> HH AH N D R AH D <space> AH N D <space> F IH F T IY <space> EY T <eos> Consecutive <space> found in phoneme: <sos> IH T <space> M EY D <space> AH <space> R IY L <space> IH M P R EH SH AH N <space> AA N <space> K AE P T AH N <space> K UH K <space> W EH N <space> HH IY <space> K EY M <space> HH IY R <space> IH N <space> W AH N <space> TH AW Z AH N D <space> <space> S EH V AH N <space> HH AH N D R AH D <space> AH N D <space> S EH V AH N T IY <eos> Consecutive <space> found in phoneme: <sos> S OW <space> F R AH M <space> W AH N <space> TH AW Z AH N D <space> <space> N AY N <space> HH AH N D R AH D <space> AH N D <space> F AO R T IY <space> T UW <space> AA N W ER D Z <eos> Consecutive <space> found in phoneme: <sos> DH AE T <space> IH Z <space> W EH R <space> AY <space> S T AA R T AH D <space> M AY <space> B IY B IY S IY <space> K ER IH R <space> B AE K <space> IH N <space> W AH N <space> TH AW Z AH N D <space> <space> N AY N <space> HH AH N D R AH D <space> AH N D <space> N AY N T IY <space> TH R IY <eos> Consecutive <space> found in phoneme: <sos> IH T <space> F ER S T <space> AH P IH R D <space> AA N <space> B IY B IY S IY <space> T UW <space> IH N <space> W AH N <space> TH AW Z AH N D <space> <space> N AY N <space> HH AH N D R AH D <space> AH N D <space> S EH V AH N T IY <space> EY T <eos> Consecutive <space> found in phoneme: <sos> R AY T <space> TH R UW <space> T UW <space> AH B AW T <space> W AH N <space> TH AW Z AH N D <space> <space> EY T <space> HH AH N D R AH D <space> AH N D <space> F IH F T IY <eos> Consecutive <space> found in phoneme: <sos> DH AH <space> ER IH JH AH N AH L <space> W AH N <space> W AA Z <space> N AA K T <space> D AW N <space> T UW <space> B IY <space> R IY P L EY S T <space> B AY <space> DH IH S <space> W AH N <space> IH N <space> W AH N <space> TH AW Z AH N D <space> <space> EY T <space> HH AH N D R AH D <space> AH N D <space> EY T IY <space> S EH V AH N <space> AH N D <space> W AH T <space> IH Z <space> IY V IH N <space> M AO R <eos> Consecutive <space> found in phoneme: <sos> B IH K AO Z <space> W EH N <space> SH IY <space> D AY D <space> IH N <space> W AH N <space> TH AW Z AH N D <space> <space> EY T <space> HH AH N D R AH D <space> AH N D <space> F AO R T IY N <eos> Consecutive <space> found in phoneme: <sos> HH UW <space> W AA Z <space> B AO R N <space> IH N <space> W AH N <space> TH AW Z AH N D <space> <space> N AY N <space> HH AH N D R AH D <space> AH N D <space> F AO R T IY <space> EY T <eos> Consecutive <space> found in phoneme: <sos> AE N <space> OW L D <space> S IH N AH G AO G <space> D EY T IH NG <space> F R AH M <space> W AH N <space> TH AW Z AH N D <space> <space> TH R IY <space> HH AH N D R AH D <space> AH N D <space> TH ER D IY <space> EY T <eos> Consecutive <space> found in phoneme: <sos> DH AH <space> R IY L <space> L AE S T <space> IH N V EY ZH AH N <space> AE K CH AH W AH L IY <space> K EY M <space> HH IY R <space> IH N <space> W AH N <space> TH AW Z AH N D <space> <space> F AY V <space> HH AH N D R AH D <space> AH N D <space> F AO R T IY <space> F AY V <eos> Consecutive <space> found in phoneme: <sos> P AH L IH T AH K AH L <space> P R EH SH ER <space> L EH D <space> T UW <space> DH AH <space> P AE S AH JH <space> AH V <space> DH AH <space> W AH N <space> TH AW Z AH N D <space> <space> N AY N <space> HH AH N D R AH D <space> AH N D <space> F AY V <space> EY L IY AH N Z <space> AE K T <eos> Consecutive <space> found in phoneme: <sos> AH N D <space> S EH T <space> AH G EH N S T <space> DH AH <space> B AE K D R AA P <space> AH V <space> DH AH <space> W AH N <space> TH AW Z AH N D <space> <space> N AY N <space> HH AH N D R AH D <space> AH N D <space> EY T IY <space> F AO R <space> M AY N ER Z <space> S T R AY K <eos> Consecutive <space> found in phoneme: <sos> IH N D IY D <space> B IH T W IY N <space> W AH N <space> TH AW Z AH N D <space> <space> N AY N <space> HH AH N D R AH D <space> AH N D <space> F AO R T IY <space> F AY V <space> AH N D <space> W AH N <space> TH AW Z AH N D <space> <space> N AY N <space> HH AH N D R AH D <space> AH N D <space> S IH K S T IY <space> F AY V <space> IH T <space> IH Z <space> F EH R <space> T UW <space> S EY <space> DH AE T <space> HH IY <space> K AH N T R IH B Y UW T IH D <space> M AO R <eos> Consecutive <space> found in phoneme: <sos> B IH L D IH NG <space> W ER K <space> S T AA R T AH D <space> IH N <space> W AH N <space> TH AW Z AH N D <space> <space> S EH V AH N <space> HH AH N D R AH D <space> AH N D <space> TH ER D IY <space> N AY N <eos> Consecutive <space> found in phoneme: <sos> DH AH <space> HH AW S <space> W AA Z <space> B IH L T <space> IH N <space> W AH N <space> TH AW Z AH N D <space> <space> S IH K S <space> HH AH N D R AH D <space> AH N D <space> S IH K S <eos> Consecutive <space> found in phoneme: <sos> AH <space> P AH B <space> CH EY N <space> EH S <space> B IH N <space> K R IH T AH S AY Z D <space> F AO R <space> DH IH S <space> D AH B AH L <space> D OW N AH T <space> B ER G ER <space> W IH DH <space> W AH N <space> TH AW Z AH N D <space> <space> N AY N <space> HH AH N D R AH D <space> AH N D <space> N AY N T IY <space> S IH K S <eos> Consecutive <space> found in phoneme: <sos> AH <space> K AA P IY <space> AH V <space> AH <space> M Y UW T ER S AY K AH N G <space> M AE G AH Z IY N <space> F R AH M <space> W AH N <space> TH AW Z AH N D <space> <space> N AY N <space> HH AH N D R AH D <space> AH N D <space> S EH V AH N T IY <space> S IH K S <eos> Consecutive <space> found in phoneme: <sos> M AO R <space> R IY S AH N T L IY <space> IH N <space> W AH N <space> TH AW Z AH N D <space> <space> N AY N <space> HH AH N D R AH D <space> AH N D <space> F AO R T IY <space> N AY N <eos> Consecutive <space> found in phoneme: <sos> EH V ER <space> S IH N S <space> IH T S <space> F ER S T <space> AH P IH R AH N S <space> IH N <space> AH <space> B L AE K <space> AH N D <space> W AY T <space> S K R AE P Y AA R D <space> IH N <space> W AH N <space> TH AW Z AH N D <space> <space> N AY N <space> HH AH N D R AH D <space> AH N D <space> S IH K S T IY <space> TH R IY <eos> Consecutive <space> found in phoneme: <sos> AH N D <space> IH N <space> D IH S EH M B ER <space> W AH N <space> TH AW Z AH N D <space> <space> F AY V <space> HH AH N D R AH D <space> AH N D <space> T W EH N T IY <space> S EH V AH N <eos> Consecutive <space> found in phoneme: <sos> OW V ER <space> S IH K S <space> D EY Z <space> IH N <space> W AH N <space> TH AW Z AH N D <space> <space> N AY N <space> HH AH N D R AH D <space> AH N D <space> F IH F T IY <space> EY T <eos> Consecutive <space> found in phoneme: <sos> S AH M <space> W AH N <space> TH AW Z AH N D <space> <space> EY T <space> HH AH N D R AH D <space> Y IH R Z <space> AH G OW <eos> Consecutive <space> found in phoneme: <sos> AH N D <space> DH IH S <space> M EH G AH F OW N <space> D EY T S <space> F R AH M <space> W AH N <space> TH AW Z AH N D <space> <space> N AY N <space> HH AH N D R AH D <space> AH N D <space> S IH K S T IY <space> TH R IY <eos> Consecutive <space> found in phoneme: <sos> T UW <space> TH AW Z AH N D <space> AH N D <space> T EH N <space> W IY <space> IH N HH EH R AH T IH D <space> DH AH <space> L OW AH S T <space> L EH V AH L <space> AH V <space> B IH L D IH NG Z <space> S IH N S <space> W AH N <space> TH AW Z AH N D <space> <space> N AY N <space> HH AH N D R AH D <space> AH N D <space> T W EH N T IY <space> TH R IY <eos> Consecutive <space> found in phoneme: <sos> DH AH <space> F ER S T <space> R EH F ER AH N S <space> T UW <space> DH AH <space> B AO R AH S T OW N <space> IH Z <space> F R AH M <space> W AH N <space> TH AW Z AH N D <space> <space> S EH V AH N <space> HH AH N D R AH D <space> AH N D <space> T W EH N T IY <space> TH R IY <eos> Consecutive <space> found in phoneme: <sos> HH UW <space> W AH N <space> AH <space> B EH S T <space> AE K T R AH S <space> AO S K ER <space> IH N <space> W AH N <space> TH AW Z AH N D <space> <space> N AY N <space> HH AH N D R AH D <space> AH N D <space> EY T IY <space> EY T <space> F AO R <space> DH AH <space> F IH L M <space> M UW N S T R AH K <eos> Consecutive <space> found in phoneme: <sos> W EY <space> B AE K <space> IH N <space> W AH N <space> TH AW Z AH N D <space> <space> N AY N <space> HH AH N D R AH D <space> AH N D <space> N AY N T IY <space> W AH N <eos> Consecutive <space> found in phoneme: <sos> HH UW <space> W AA Z <space> IH M P L IH K EY T IH D <space> IH N <space> DH AH <space> G AH N P AW D ER <space> P L AA T <space> IH N <space> W AH N <space> TH AW Z AH N D <space> <space> S IH K S <space> HH AH N D R AH D <space> AH N D <space> F AY V <eos> Consecutive <space> found in phoneme: <sos> DH AH <space> S T AA R <space> AH V <space> DH AH <space> S T AA R <space> T R EH K <space> S IH R IY Z <space> AH N D <space> F IH L M Z <space> B IY M D <space> D AW N <space> T UW <space> DH AH <space> W OW G AH N <space> S T UW D IY OW <space> IH N <space> W AH N <space> TH AW Z AH N D <space> <space> N AY N <space> HH AH N D R AH D <space> AH N D <space> EY T IY <space> N AY N <eos> Consecutive <space> found in phoneme: <sos> W EH N <space> B EH T IY <space> D EY V AH S <space> K EY M <space> AA N <space> DH AH <space> SH OW <space> B AE K <space> IH N <space> W AH N <space> TH AW Z AH N D <space> <space> N AY N <space> HH AH N D R AH D <space> AH N D <space> EY T IY <space> S EH V AH N <eos> Consecutive <space> found in phoneme: <sos> IH N <space> W IH CH <space> AY <space> F L AY <space> AH <space> W AH N <space> TH AW Z AH N D <space> <space> N AY N <space> HH AH N D R AH D <space> AH N D <space> T W EH N T IY <space> S IH K S <space> S T IH R M AH N <eos> Consecutive <space> found in phoneme: <sos> IH N <space> W AH N <space> TH AW Z AH N D <space> <space> EY T <space> HH AH N D R AH D <space> AH N D <space> T W EH N T IY <space> EY T <space> DH AE T <space> DH AH <space> F ER S T <space> F R EH N CH <space> AH N Y AH N <space> S EH L ER <space> D IH S AY D IH D <space> T UW <space> T R AY <space> HH IH Z <space> L AH K <space> AH N D <space> K R AO S <space> DH AH <eos> Consecutive <space> found in phoneme: <sos> AH K AO R D IH NG <space> T UW <space> DH AH <space> W AH N <space> TH AW Z AH N D <space> <space> N AY N <space> HH AH N D R AH D <space> AH N D <space> IH L EH V AH N <space> S EH N S AH S <eos> Consecutive <space> found in phoneme: <sos> IH N <space> W AH N <space> TH AW Z AH N D <space> <space> EY T <space> HH AH N D R AH D <space> AH N D <space> T W EH N T IY <space> EY T <space> DH AE T <space> DH AH <space> F ER S T <space> F R EH N CH <space> AH N Y AH N <space> S EH L ER <space> D IH S AY D IH D <space> T UW <space> T R AY <space> HH IH Z <space> L AH K <space> AH N D <space> K R AO S <space> DH AH <eos> Consecutive <space> found in phoneme: <sos> IH T <space> W AA Z <space> AE T <space> DH IH S <space> V EH R IY <space> S P AA T <space> IH N <space> W AH N <space> TH AW Z AH N D <space> <space> S IH K S <space> HH AH N D R AH D <space> AH N D <space> F IH F T IY <space> W AH N <space> DH AE T <space> CH AA R L Z <space> IY <space> IH S K EY P T <space> K AE P CH ER <space> B AY <eos> Consecutive <space> found in phoneme: <sos> W IY <space> HH AE V <space> N AA T <space> HH AE D <space> AE N <space> AA R M IY <space> S IH N S <space> W AH N <space> TH AW Z AH N D <space> <space> N AY N <space> HH AH N D R AH D <space> AH N D <space> F AO R T IY <space> EY T <eos> Consecutive <space> found in phoneme: <sos> S OW <space> DH EY <space> JH OY N D <space> F AO R S IH Z <space> W IH DH <space> DH AH <space> AO S T R IY AH N Z <space> AH N D <space> B AY <space> W AH N <space> TH AW Z AH N D <space> <space> S EH V AH N <space> HH AH N D R AH D <space> AH N D <space> S IH K S <eos> Consecutive <space> found in phoneme: <sos> S UW N <space> AE F T ER <space> DH AH <space> N AA T S IY Z <space> K EY M <space> T UW <space> P AW ER <space> IH N <space> W AH N <space> TH AW Z AH N D <space> <space> N AY N <space> HH AH N D R AH D <space> AH N D <space> TH ER D IY <space> TH R IY <eos> Consecutive <space> found in phoneme: <sos> AH N D <space> SH IY <space> HH AE D <space> DH IH S <space> AH F EH R <space> W IH DH <space> EH D W ER D <space> B IH T W IY N <space> W AH N <space> TH AW Z AH N D <space> <space> N AY N <space> HH AH N D R AH D <space> AH N D <space> EY T IY N <space> AH N D <space> W AH N <space> TH AW Z AH N D <space> <space> N AY N <space> HH AH N D R AH D <space> AH N D <space> T W EH N T IY <space> TH R IY <eos> Consecutive <space> found in phoneme: <sos> B IH K AO Z <space> IH N <space> W AH N <space> TH AW Z AH N D <space> <space> N AY N <space> HH AH N D R AH D <space> AH N D <space> EY T IY <space> W AH N <space> W IY <space> W ER <space> R IH L IY <space> AE T <space> DH AH <space> T IH P IH NG <space> P OY N T <space> B IH T W IY N <space> DH AH <space> T ER B OW <eos> Consecutive <space> found in phoneme: <sos> AY <space> S IY <space> IH T <space> W AA Z <space> R IH T AH N <space> IH N <space> JH UW N <space> W AH N <space> TH AW Z AH N D <space> <space> N AY N <space> HH AH N D R AH D <space> AH N D <space> S IH K S T IY <space> S EH V AH N <eos> Consecutive <space> found in phoneme: <sos> IH T <space> IH Z <space> B EY S T <space> AA N <space> HH IH Z <space> S EH L F <space> P AO R T R AH T <space> AH V <space> W AH N <space> TH AW Z AH N D <space> <space> S EH V AH N <space> HH AH N D R AH D <space> AH N D <space> F AO R T IY <space> F AY V <eos> Consecutive <space> found in phoneme: <sos> DH IH S <space> S AY T <space> W AA Z <space> AE N <space> R AE F <space> EH R <space> B EY S <space> F R AH M <space> W AH N <space> TH AW Z AH N D <space> <space> N AY N <space> HH AH N D R AH D <space> AH N D <space> EY T IY N <space> AH N T IH L <space> W AH N <space> TH AW Z AH N D <space> <space> N AY N <space> HH AH N D R AH D <space> AH N D <space> S IH K S T IY <space> W AH N <space> AH N D <space> N AW <eos> Consecutive <space> found in phoneme: <sos> IH N <space> DH AH <space> S AH M ER <space> AH V <space> W AH N <space> TH AW Z AH N D <space> <space> N AY N <space> HH AH N D R AH D <space> AH N D <space> F AO R T IY <eos> Consecutive <space> found in phoneme: <sos> AE T <space> DH AH <space> B AE T AH L <space> AH V <space> K W IH B ER OW N <space> B EY <space> IH N <space> W AH N <space> TH AW Z AH N D <space> <space> S EH V AH N <space> HH AH N D R AH D <space> AH N D <space> F IH F T IY <space> N AY N <eos> Consecutive <space> found in phoneme: <sos> IH N <space> DH AH <space> S AH M ER <space> AH V <space> W AH N <space> TH AW Z AH N D <space> <space> N AY N <space> HH AH N D R AH D <space> AH N D <space> F AO R T IY <eos> Consecutive <space> found in phoneme: <sos> HH AE V IH NG <space> B IH N <space> K AH M P L IY T AH D <space> IH N <space> W AH N <space> TH AW Z AH N D <space> <space> N AY N <space> HH AH N D R AH D <space> AH N D <space> S IH K S T IY <eos> Consecutive <space> found in phoneme: <sos> W IY <space> S AH D AH N L IY <space> EH M B AA R K T <space> AA N <space> AH <space> HH EH D L AO NG <space> R AH SH <space> T UW <space> G EH T <space> R IH D <space> AH V <space> S T IY M <space> F R AH M <space> W AH N <space> TH AW Z AH N D <space> <space> N AY N <space> HH AH N D R AH D <space> AH N D <space> S IH K S T IY <space> TH R IY <space> T UW <eos> Consecutive <space> found in phoneme: <sos> R AY T <space> AH P <space> T UW <space> W AH N <space> TH AW Z AH N D <space> <space> EY T <space> HH AH N D R AH D <space> AH N D <space> T W EH N T IY <space> S IH K S <eos> Consecutive <space> found in phoneme: <sos> HH IY <space> P AE T AH N T AH D <space> DH AH <space> S AH L IH N D R IH K AH L <space> S L AY D <space> R UW L <space> IH N <space> W AH N <space> TH AW Z AH N D <space> <space> EY T <space> HH AH N D R AH D <space> AH N D <space> S EH V AH N T IY <space> EY T <eos> Consecutive <space> found in phoneme: <sos> B IH K AO Z <space> HH IY <space> K EY M <space> T UW <space> P AW ER <space> IH N <space> AH <space> M IH L AH T EH R IY <space> K UW <space> IH N <space> W AH N <space> TH AW Z AH N D <space> <space> N AY N <space> HH AH N D R AH D <space> AH N D <space> S EH V AH N T IY <space> S EH V AH N <eos> Consecutive <space> found in phoneme: <sos> DH AH <space> L AE S T <space> AA B Z ER V EY SH AH N <space> T UW <space> B IY <space> D AH N <space> HH IY R <space> W AA Z <space> M EY D <space> IH N <space> W AH N <space> TH AW Z AH N D <space> <space> N AY N <space> HH AH N D R AH D <space> AH N D <space> F IH F T IY <space> F AO R <eos> Consecutive <space> found in phoneme: <sos> IH N <space> D IH S EH M B ER <space> W AH N <space> TH AW Z AH N D <space> <space> N AY N <space> HH AH N D R AH D <space> AH N D <space> F IH F T IY <space> W AH N <eos> Consecutive <space> found in phoneme: <sos> S IH N S <space> K AA M IH K <space> R IH L IY F <space> S T AA R T AH D <space> IH N <space> W AH N <space> TH AW Z AH N D <space> <space> N AY N <space> HH AH N D R AH D <space> AH N D <space> EY T IY <space> F AY V <eos> Consecutive <space> found in phoneme: <sos> HH UW <space> W AA Z <space> B AO R N <space> IH N <space> W AH N <space> TH AW Z AH N D <space> <space> N AY N <space> HH AH N D R AH D <space> AH N D <space> IH L EH V AH N <eos> Consecutive <space> found in phoneme: <sos> AH N D <space> AE Z <space> AH <space> CH AY L D <space> IH N <space> W AH N <space> TH AW Z AH N D <space> <space> N AY N <space> HH AH N D R AH D <space> AH N D <space> S IH K S T IY <space> S EH V AH N <eos> Consecutive <space> found in phoneme: <sos> HH IY <space> W AA Z <space> W ER K IH NG <space> AW T <space> IH N <space> IY S T <space> AE F R AH K AH <space> IH N <space> W AH N <space> TH AW Z AH N D <space> <space> N AY N <space> HH AH N D R AH D <space> AH N D <space> S EH V AH N T IY <space> TH R IY <space> W EH N <space> HH IY <space> K EY M <space> AH K R AO S <space> AH <space> F AH S IH L AH S T <space> B OW N <space> DH AE T <eos> Consecutive <space> found in phoneme: <sos> IH T <space> S T AA R T AH D <space> IH N <space> W AH N <space> TH AW Z AH N D <space> <space> N AY N <space> HH AH N D R AH D <space> AH N D <space> N AY N T IY <space> F AO R <eos> Consecutive <space> found in phoneme: <sos> JH AH S T <space> AH N AH DH ER <space> T UW <space> TH AW Z AH N D <space> <space> TH R IY <space> HH AH N D R AH D <space> T UW <space> G OW <eos> Consecutive <space> found in phoneme: <sos> DH IH S <space> B UH K <space> W AA Z <space> P AH B L IH SH T <space> IH N <space> W AH N <space> TH AW Z AH N D <space> <space> EY T <space> HH AH N D R AH D <space> AH N D <space> F IH F T IY <space> T UW <eos> Consecutive <space> found in phoneme: <sos> IH N T R AH D UW S T <space> IH N T UW <space> S ER V AH S <space> IH N <space> W AH N <space> TH AW Z AH N D <space> <space> N AY N <space> HH AH N D R AH D <space> AH N D <space> S IH K S T IY <space> N AY N <eos> Consecutive <space> found in phoneme: <sos> R AY T <space> AH P <space> AH N T IH L <space> W AH N <space> TH AW Z AH N D <space> <space> N AY N <space> HH AH N D R AH D <space> AH N D <space> N AY N T IY <space> EY T <eos> Consecutive <space> found in phoneme: <sos> DH AE T <space> W AA Z <space> IH N <space> W AH N <space> TH AW Z AH N D <space> <space> N AY N <space> HH AH N D R AH D <space> AH N D <space> S EH V AH N T IY <space> TH R IY <eos> Consecutive <space> found in phoneme: <sos> AH N D <space> DH AE T <space> T UH K <space> P L EY S <space> IH N <space> W AH N <space> TH AW Z AH N D <space> <space> EY T <space> HH AH N D R AH D <space> AH N D <space> TH ER D IY <space> F AY V <eos> Consecutive <space> found in phoneme: <sos> W EH N <space> IH T <space> OW P AH N D <space> IH N <space> W AH N <space> TH AW Z AH N D <space> <space> EY T <space> HH AH N D R AH D <space> AH N D <space> T W EH N T IY <space> S IH K S <eos> Consecutive <space> found in phoneme: <sos> W IY <space> G AA T <space> IH T <space> IH N <space> W AH N <space> TH AW Z AH N D <space> <space> EY T <space> HH AH N D R AH D <space> AH N D <space> F AO R T IY <space> EY T <eos> Consecutive <space> found in phoneme: <sos> DH IY Z <space> W ER <space> N OW N <space> AE Z <space> DH AH <space> AO S T EH R IH T IY <space> G EY M Z <space> IH N <space> W AH N <space> TH AW Z AH N D <space> <space> N AY N <space> HH AH N D R AH D <space> AH N D <space> F AO R T IY <space> EY T <eos> Consecutive <space> found in phoneme: <sos> DH IY Z <space> W ER <space> N OW N <space> AE Z <space> DH AH <space> AO S T EH R IH T IY <space> G EY M Z <space> IH N <space> W AH N <space> TH AW Z AH N D <space> <space> N AY N <space> HH AH N D R AH D <space> AH N D <space> F AO R T IY <space> EY T <eos> Consecutive <space> found in phoneme: <sos> S OW <space> AY <space> R EH K AH N <space> AH B AW T <space> W AH N <space> TH AW Z AH N D <space> <space> S EH V AH N <space> HH AH N D R AH D <space> AH N D <space> N AY N T IY <eos> Consecutive <space> found in phoneme: <sos> P AH B L IH SH T <space> IH N <space> W AH N <space> TH AW Z AH N D <space> <space> N AY N <space> HH AH N D R AH D <space> AH N D <space> EY T IY <space> F AO R <eos> Consecutive <space> found in phoneme: <sos> AY <space> TH IH NG K <space> IH N <space> JH ER M AH N IY <space> IH T <space> S T AA R T AH D <space> W IH DH <space> DH AH <space> R EH V AH L UW SH AH N <space> IH N <space> W AH N <space> TH AW Z AH N D <space> <space> EY T <space> HH AH N D R AH D <space> AH N D <space> F AO R T IY <space> EY T <eos> Consecutive <space> found in phoneme: <sos> AH N D <space> DH AE T <space> W AA Z <space> AH B AW T <space> W AH N <space> TH AW Z AH N D <space> <space> EY T <space> HH AH N D R AH D <space> AH N D <space> F AO R T IY <space> T UW <eos>
In [ ]:
def count_consecutive_special_tokens(sentence_sequences, phoneme_sequences):
special_tokens = ['<eos>', '<sos>', '<space>']
count = 0
for seq in sentence_sequences:
for token in special_tokens:
if f"{token} {token}" in seq:
count += 1
for seq in phoneme_sequences:
for token in special_tokens:
if f"{token} {token}" in ' '.join(seq):
count += 1
return count
# Example usage:
count = count_consecutive_special_tokens(df['sentence_with_tokens'], df['phonemes'])
print(f"Number of sentences with consecutive special tokens: {count}")
Number of sentences with consecutive special tokens: 114
In [ ]:
import matplotlib.pyplot as plt
import seaborn as sns
# Count consecutive special tokens
count = count_consecutive_special_tokens(df['sentence_with_tokens'], df['phonemes'])
# Data for visualization
labels = ['Sentences with Consecutive Tokens', 'Total Sentences']
values = [count, len(df) - count]
percentages = [value / len(df) * 100 for value in values]
# Colors for the visualizations
colors = ['#3498DB', '#E74C3C']
# Visualization
plt.figure(figsize=(10, 6))
sns.set_context("talk", font_scale=0.8)
bars = sns.barplot(x=labels, y=values, palette=colors)
# Annotate the bars with the count value and percentage
for index, (value, percentage) in enumerate(zip(values, percentages)):
plt.text(index, value + (0.02 * max(values)),
f"{value} ({percentage:.1f}%)",
ha='center', va='center', fontweight='bold', fontsize=14)
# Set title and labels
plt.title('Sentences with Consecutive Special Tokens vs. Total Sentences', fontsize=15)
plt.ylabel('Number of Sentences', fontsize=13)
plt.xticks(fontsize=12)
# Ensure the text fits within the figure bounds
plt.tight_layout()
# Show the plot
plt.show()
In [ ]:
df.head()
Out[ ]:
sentence | word_count | original_sentence | word_count_after_conversion | changed | phonemes | num_spaces | phonemes_str | sentence_with_tokens | phoneme_count | tokenized_sentence | |
---|---|---|---|---|---|---|---|---|---|---|---|
0 | when you are cooking chips at home | 7 | WHEN YOU'RE COOKING CHIPS AT HOME | 6 | False | [<sos>, W, EH, N, <space>, Y, UW, <space>, AA,... | 6 | <sos> W EH N <space> Y UW <space> AA R <space>... | <sos> when <space> you <space> are <space> coo... | 29 | <sos> when <space> you <space> are <space> coo... |
1 | the traditional chip pan often stays on the shelf | 9 | THE TRADITIONAL CHIP PAN OFTEN STAYS ON THE SHELF | 9 | False | [<sos>, DH, AH, <space>, T, R, AH, D, IH, SH, ... | 8 | <sos> DH AH <space> T R AH D IH SH AH N AH L <... | <sos> the <space> traditional <space> chip <sp... | 44 | <sos> the <space> traditional <space> chip <sp... |
2 | through what they call a knife block | 7 | THROUGH WHAT THEY CALL A KNIFE BLOCK | 7 | False | [<sos>, TH, R, UW, <space>, W, AH, T, <space>,... | 6 | <sos> TH R UW <space> W AH T <space> DH EY <sp... | <sos> through <space> what <space> they <space... | 27 | <sos> through <space> what <space> they <space... |
3 | which involves firing a potato down a pipe | 8 | WHICH INVOLVES FIRING A POTATO DOWN A PIPE | 8 | False | [<sos>, W, IH, CH, <space>, IH, N, V, AA, L, V... | 7 | <sos> W IH CH <space> IH N V AA L V Z <space> ... | <sos> which <space> involves <space> firing <s... | 38 | <sos> which <space> involves <space> firing <s... |
4 | apart from the golden colour and the delicious... | 9 | APART FROM THE GOLDEN COLOUR AND THE DELICIOUS... | 9 | False | [<sos>, AH, P, AA, R, T, <space>, F, R, AH, M,... | 8 | <sos> AH P AA R T <space> F R AH M <space> DH ... | <sos> apart <space> from <space> the <space> g... | 49 | <sos> apart <space> from <space> the <space> g... |
In [ ]:
def has_consecutive_special_tokens(seq):
special_tokens = ['<eos>', '<sos>', '<space>']
for token in special_tokens:
if f"{token} {token}" in seq:
return True
return False
# Create a mask that is True for rows without consecutive special tokens
mask = ~df['sentence_with_tokens'].apply(has_consecutive_special_tokens) & ~df['phonemes'].apply(lambda x: has_consecutive_special_tokens(' '.join(x)))
# Index df with the mask
df = df[mask]
print(df)
sentence word_count \ 0 when you are cooking chips at home 7 1 the traditional chip pan often stays on the shelf 9 2 through what they call a knife block 7 3 which involves firing a potato down a pipe 8 4 apart from the golden colour and the delicious... 9 ... ... ... 45834 when he is not having his seizures when he is ... 13 45835 she wants attention from both of us and 8 45836 as much as we try to give it to her 10 45837 they so deserve 3 45838 thank you enough for what you have done 8 original_sentence \ 0 WHEN YOU'RE COOKING CHIPS AT HOME 1 THE TRADITIONAL CHIP PAN OFTEN STAYS ON THE SHELF 2 THROUGH WHAT THEY CALL A KNIFE BLOCK 3 WHICH INVOLVES FIRING A POTATO DOWN A PIPE 4 APART FROM THE GOLDEN COLOUR AND THE DELICIOUS... ... ... 45834 WHEN HE'S NOT HAVING HIS SEIZURES WHEN HE'S NO... 45835 SHE WANTS ATTENTION FROM BOTH OF US AND 45836 AS MUCH AS WE TRY TO GIVE IT TO HER 45837 THEY SO DESERVE 45838 THANK YOU ENOUGH FOR WHAT YOU'VE DONE word_count_after_conversion changed \ 0 6 False 1 9 False 2 7 False 3 8 False 4 9 False ... ... ... 45834 11 False 45835 8 False 45836 10 False 45837 3 False 45838 7 False phonemes num_spaces \ 0 [<sos>, W, EH, N, <space>, Y, UW, <space>, AA,... 6 1 [<sos>, DH, AH, <space>, T, R, AH, D, IH, SH, ... 8 2 [<sos>, TH, R, UW, <space>, W, AH, T, <space>,... 6 3 [<sos>, W, IH, CH, <space>, IH, N, V, AA, L, V... 7 4 [<sos>, AH, P, AA, R, T, <space>, F, R, AH, M,... 8 ... ... ... 45834 [<sos>, W, EH, N, <space>, HH, IY, <space>, IH... 12 45835 [<sos>, SH, IY, <space>, W, AA, N, T, S, <spac... 7 45836 [<sos>, AE, Z, <space>, M, AH, CH, <space>, AE... 9 45837 [<sos>, DH, EY, <space>, S, OW, <space>, D, IH... 2 45838 [<sos>, TH, AE, NG, K, <space>, Y, UW, <space>... 7 phonemes_str \ 0 <sos> W EH N <space> Y UW <space> AA R <space>... 1 <sos> DH AH <space> T R AH D IH SH AH N AH L <... 2 <sos> TH R UW <space> W AH T <space> DH EY <sp... 3 <sos> W IH CH <space> IH N V AA L V Z <space> ... 4 <sos> AH P AA R T <space> F R AH M <space> DH ... ... ... 45834 <sos> W EH N <space> HH IY <space> IH Z <space... 45835 <sos> SH IY <space> W AA N T S <space> AH T EH... 45836 <sos> AE Z <space> M AH CH <space> AE Z <space... 45837 <sos> DH EY <space> S OW <space> D IH Z ER V <... 45838 <sos> TH AE NG K <space> Y UW <space> IH N AH ... sentence_with_tokens phoneme_count \ 0 <sos> when <space> you <space> are <space> coo... 29 1 <sos> the <space> traditional <space> chip <sp... 44 2 <sos> through <space> what <space> they <space... 27 3 <sos> which <space> involves <space> firing <s... 38 4 <sos> apart <space> from <space> the <space> g... 49 ... ... ... 45834 <sos> when <space> he <space> is <space> not <... 54 45835 <sos> she <space> wants <space> attention <spa... 37 45836 <sos> as <space> much <space> as <space> we <s... 34 45837 <sos> they <space> so <space> deserve <eos> 13 45838 <sos> thank <space> you <space> enough <space>... 33 tokenized_sentence 0 <sos> when <space> you <space> are <space> coo... 1 <sos> the <space> traditional <space> chip <sp... 2 <sos> through <space> what <space> they <space... 3 <sos> which <space> involves <space> firing <s... 4 <sos> apart <space> from <space> the <space> g... ... ... 45834 <sos> when <space> he <space> is <space> not <... 45835 <sos> she <space> wants <space> attention <spa... 45836 <sos> as <space> much <space> as <space> we <s... 45837 <sos> they <space> so <space> deserve <eos> 45838 <sos> thank <space> you <space> enough <space>... [45700 rows x 11 columns]
In [ ]:
space_sentences = df[df['phonemes'].apply(lambda x: ' ' in x)]
print(space_sentences[['sentence', 'phonemes']])
Empty DataFrame Columns: [sentence, phonemes] Index: []
In [ ]:
import sys
sys.path.append('/content/drive/MyDrive/Dissertation')
from label_vectorization import SentenceVectorizer
In [ ]:
# Get the 10 most common phonemes
most_common_phonemes = phoneme_freq.most_common(10)
# Print the 10 most common phonemes
print("10 Most Common Phonemes:")
for phoneme, count in most_common_phonemes:
print(f"{phoneme}: {count}")
# Set up the visualization with a refined style and context
sns.set_style("whitegrid")
sns.set_context("talk")
plt.figure(figsize=(15, 8))
# Extract phoneme names and their counts
phonemes = [phoneme for phoneme, _ in most_common_phonemes]
counts = [count for _, count in most_common_phonemes]
# Use a sophisticated color palette (deep muted colors)
palette = sns.color_palette("viridis", n_colors=len(most_common_phonemes))
# Plot the phoneme frequencies
bars = sns.barplot(x=phonemes, y=counts, palette=palette)
# Add annotations to each bar
for index, value in enumerate(counts):
bars.text(index, value + max(counts)*0.02, f'{value} ({value/sum(counts)*100:.1f}%)', color='black', ha="center", va="bottom", fontsize=12)
# Set title, xlabel, ylabel and adjust font sizes
plt.title('Top 10 Phoneme Frequencies', fontsize=22, fontweight='bold', pad=20)
plt.xlabel('Phoneme', fontsize=18)
plt.ylabel('Frequency', fontsize=18)
plt.xticks(fontsize=16)
plt.yticks(fontsize=16)
# Ensure the plot layout is organized
plt.tight_layout()
# Show the plot
plt.show()
10 Most Common Phonemes: <space>: 299529 AH: 111029 T: 91599 N: 77726 IH: 75183 R: 52083 S: 50329 D: 47510 <sos>: 45814 <eos>: 45814
In [ ]:
# Concatenate all lists of phonemes and create a Counter object
all_phonemes = [phoneme for sublist in df['phonemes'] for phoneme in sublist]
phoneme_freq = Counter(all_phonemes)
# Get all unique phonemes
unique_phonemes = list(phoneme_freq.keys())
unique_phonemes
Out[ ]:
['<sos>', 'W', 'EH', 'N', '<space>', 'Y', 'UW', 'AA', 'R', 'K', 'UH', 'IH', 'NG', 'CH', 'P', 'S', 'AE', 'T', 'HH', 'OW', 'M', '<eos>', 'DH', 'AH', 'D', 'SH', 'L', 'AO', 'F', 'EY', 'Z', 'TH', 'AY', 'B', 'V', 'AW', 'G', 'ER', 'IY', 'JH', 'OY', 'ZH']
In [ ]:
# Define viseme categories
viseme_dict = {
'aa': ['aa', 'aw', 'ay'], 'ah': ['ah'], 'ao': ['ao', 'oy', 'ow'],
'ch': ['jh', 'ch', 'sh', 'zh'], 'er': ['er'], 'ey': ['eh', 'ey', 'ae'],
'f': ['f', 'v'], 'iy': ['ih', 'iy'], 'k': ['k', 'g', 'ng', 'n'],
'p': ['p', 'b', 'm'], 't': ['t', 'd', 's', 'z', 'th', 'dh'],
'uh': ['uh', 'uw'], 'w': ['w', 'r', 'l', 'y', 'hh'],
'space': ['<space>'], 'sos': ['<sos>'], 'eos': ['<eos>']
}
phoneme_to_viseme = {phoneme: viseme for viseme, phonemes in viseme_dict.items() for phoneme in phonemes}
def phonemes_to_visemes(phonemes):
visemes = []
for phoneme in phonemes:
if phoneme in ['<sos>', '<eos>', '<space>']:
visemes.append(phoneme)
else:
phoneme = phoneme[:-1] if phoneme[-1].isdigit() else phoneme
viseme = phoneme_to_viseme.get(phoneme, 'unknown')
visemes.append(viseme)
return visemes
# Example DataFrame
df_check = pd.DataFrame({
'phonemes': [['<sos>', 'W', 'EH', 'N', '<space>', 'Y', 'UW', 'K', 'UH', 'IH', 'NG', 'CH', 'P', 'S', 'AE', 'T', 'HH', 'OW', 'M', '<eos>', 'DH', 'AH', 'R', 'D', 'SH', 'L', 'AO', 'F', 'EY', 'Z', 'AA', 'TH', 'AY', 'B', 'V', 'AW', 'G', 'ER', 'IY', 'JH', 'OY', 'ZH']]
})
# Convert phonemes to lowercase
df['phonemes'] = df['phonemes'].apply(lambda phonemes: [phoneme.lower() for phoneme in phonemes])
# Convert phonemes to visemes in df_expanded
df['visemes'] = df['phonemes'].apply(phonemes_to_visemes)
# Print the first few rows to check the results
print(df[['phonemes', 'visemes']].head())
# Visual Inspection
print(df[['phonemes', 'visemes']].sample(5))
# Mapping Consistency
phoneme_to_viseme = {}
inconsistencies = 0
for phonemes, visemes in zip(df['phonemes'], df['visemes']):
for phoneme, viseme in zip(phonemes, visemes):
phoneme = phoneme[:-1] if phoneme[-1].isdigit() else phoneme
if phoneme in phoneme_to_viseme:
if phoneme_to_viseme[phoneme] != viseme:
inconsistencies += 1
else:
phoneme_to_viseme[phoneme] = viseme
print(f'Number of inconsistencies in mapping: {inconsistencies}')
# Usage of Unknown Visemes
unknown_visemes_count = df['visemes'].apply(lambda x: x.count('unknown')).sum()
print(f'Number of unknown visemes: {unknown_visemes_count}')
phonemes \ 0 [<sos>, w, eh, n, <space>, y, uw, <space>, aa,... 1 [<sos>, dh, ah, <space>, t, r, ah, d, ih, sh, ... 2 [<sos>, th, r, uw, <space>, w, ah, t, <space>,... 3 [<sos>, w, ih, ch, <space>, ih, n, v, aa, l, v... 4 [<sos>, ah, p, aa, r, t, <space>, f, r, ah, m,... visemes 0 [<sos>, w, ey, k, <space>, w, uh, <space>, aa,... 1 [<sos>, t, ah, <space>, t, w, ah, t, iy, ch, a... 2 [<sos>, t, w, uh, <space>, w, ah, t, <space>, ... 3 [<sos>, w, iy, ch, <space>, iy, k, f, aa, w, f... 4 [<sos>, ah, p, aa, w, t, <space>, f, w, ah, p,... phonemes \ 16950 [<sos>, ih, z, <space>, n, aa, t, <space>, dh,... 9165 [<sos>, ae, z, <space>, dh, ah, <space>, d, ae... 17444 [<sos>, ih, t, <space>, w, aa, z, <space>, hh,... 1656 [<sos>, y, uw, <space>, hh, ae, v, <space>, g,... 40670 [<sos>, w, iy, <space>, aa, r, <space>, ae, s,... visemes 16950 [<sos>, iy, t, <space>, k, aa, t, <space>, t, ... 9165 [<sos>, ey, t, <space>, t, ah, <space>, t, ey,... 17444 [<sos>, iy, t, <space>, w, aa, t, <space>, w, ... 1656 [<sos>, w, uh, <space>, w, ey, f, <space>, k, ... 40670 [<sos>, w, iy, <space>, aa, w, <space>, ey, t,... Number of inconsistencies in mapping: 0 Number of unknown visemes: 0
In [ ]:
# Set display options
pd.set_option('display.max_rows', 5)
pd.set_option('display.max_colwidth', None)
# Display the first 5 rows
display(df[['phonemes', 'visemes']].head())
phonemes | visemes | |
---|---|---|
0 | [<sos>, w, eh, n, <space>, y, uw, <space>, aa, r, <space>, k, uh, k, ih, ng, <space>, ch, ih, p, s, <space>, ae, t, <space>, hh, ow, m, <eos>] | [<sos>, w, ey, k, <space>, w, uh, <space>, aa, w, <space>, k, uh, k, iy, k, <space>, ch, iy, p, t, <space>, ey, t, <space>, w, ao, p, <eos>] |
1 | [<sos>, dh, ah, <space>, t, r, ah, d, ih, sh, ah, n, ah, l, <space>, ch, ih, p, <space>, p, ae, n, <space>, ao, f, ah, n, <space>, s, t, ey, z, <space>, aa, n, <space>, dh, ah, <space>, sh, eh, l, f, <eos>] | [<sos>, t, ah, <space>, t, w, ah, t, iy, ch, ah, k, ah, w, <space>, ch, iy, p, <space>, p, ey, k, <space>, ao, f, ah, k, <space>, t, t, ey, t, <space>, aa, k, <space>, t, ah, <space>, ch, ey, w, f, <eos>] |
2 | [<sos>, th, r, uw, <space>, w, ah, t, <space>, dh, ey, <space>, k, ao, l, <space>, ah, <space>, n, ay, f, <space>, b, l, aa, k, <eos>] | [<sos>, t, w, uh, <space>, w, ah, t, <space>, t, ey, <space>, k, ao, w, <space>, ah, <space>, k, aa, f, <space>, p, w, aa, k, <eos>] |
3 | [<sos>, w, ih, ch, <space>, ih, n, v, aa, l, v, z, <space>, f, ay, r, ih, ng, <space>, ah, <space>, p, ah, t, ey, t, ow, <space>, d, aw, n, <space>, ah, <space>, p, ay, p, <eos>] | [<sos>, w, iy, ch, <space>, iy, k, f, aa, w, f, t, <space>, f, aa, w, iy, k, <space>, ah, <space>, p, ah, t, ey, t, ao, <space>, t, aa, k, <space>, ah, <space>, p, aa, p, <eos>] |
4 | [<sos>, ah, p, aa, r, t, <space>, f, r, ah, m, <space>, dh, ah, <space>, g, ow, l, d, ah, n, <space>, k, ah, l, aw, r, <space>, ah, n, d, <space>, dh, ah, <space>, d, ih, l, ih, sh, ah, s, <space>, f, l, ae, v, er, <eos>] | [<sos>, ah, p, aa, w, t, <space>, f, w, ah, p, <space>, t, ah, <space>, k, ao, w, t, ah, k, <space>, k, ah, w, aa, w, <space>, ah, k, t, <space>, t, ah, <space>, t, iy, w, iy, ch, ah, t, <space>, f, w, ey, f, er, <eos>] |
In [ ]:
# Calculate the distribution of visemes in the dataset
viseme_distribution = pd.Series([item for sublist in df['visemes'] for item in sublist]).value_counts()
# Set up the visualization parameters
sns.set_style("whitegrid")
sns.set_palette("coolwarm_r")
sns.set_context("talk")
# Calculate the percentage of each viseme in the dataset
viseme_percentage = (viseme_distribution / viseme_distribution.sum()) * 100
# Create a horizontal bar plot for the visemes
plt.figure(figsize=(14, 10))
ax = sns.barplot(y=viseme_distribution.index, x=viseme_distribution.values, orient="h", palette="viridis")
# Annotate each bar with the count and percentage of each viseme
for index, value in enumerate(viseme_distribution.values):
ax.text(value, index,
f'{value} ({viseme_percentage[index]:.1f}%)',
color='black', ha="left", va="center", fontsize=10)
plt.title('Distribution of Visemes in the Dataset', fontsize=16, fontweight='bold')
plt.ylabel('Viseme', fontsize=14)
plt.xlabel('Count', fontsize=14)
plt.show()
In [ ]:
# Extract unique phonemes and visemes from the dataframe
unique_phonemes = set([item for sublist in df['phonemes'] for item in sublist])
unique_visemes = set([item for sublist in df['visemes'] for item in sublist])
# Exclude the special tokens from the filtered list
exclude_tokens = ['<space>', '<sos>', '<eos>', 'space', 'sos', 'eos']
filtered_phonemes = [phoneme for phoneme in unique_phonemes if phoneme not in exclude_tokens]
filtered_visemes = [viseme for viseme in unique_visemes if viseme not in exclude_tokens]
# Efficiently indexing the confusion matrix
phoneme_index = {phoneme: idx for idx, phoneme in enumerate(filtered_phonemes)}
viseme_index = {viseme: idx for idx, viseme in enumerate(filtered_visemes)}
# Create a matrix for the filtered phonemes and visemes
confusion_matrix = np.zeros((len(filtered_phonemes), len(filtered_visemes)))
# Update the matrix based on the mappings in the dataset
for phonemes, visemes in zip(df['phonemes'], df['visemes']):
for phoneme, viseme in zip(phonemes, visemes):
if phoneme in phoneme_index and viseme in viseme_index:
i = phoneme_index[phoneme]
j = viseme_index[viseme]
confusion_matrix[i][j] += 1
# Plot the heatmap
plt.figure(figsize=(14, 10))
ax = sns.heatmap(confusion_matrix, annot=True, fmt=".0f", cmap="Blues",
xticklabels=filtered_visemes, yticklabels=filtered_phonemes,
annot_kws={"size": 12})
plt.title("Phoneme to Viseme Mapping Heatmap", fontsize=18, fontweight='bold')
plt.xlabel("Viseme", fontsize=16)
plt.ylabel("Phoneme", fontsize=16)
plt.xticks(fontsize=12)
plt.yticks(fontsize=12)
plt.tight_layout()
plt.show()
In [ ]:
# Check length consistency between phonemes and visemes
length_consistency = df['phonemes'].str.len().equals(df['visemes'].str.len())
print(f'Length consistency: {length_consistency}')
# Calculate lengths
df['phoneme_length'] = df['phonemes'].apply(len)
df['viseme_length'] = df['visemes'].apply(len)
# Find mismatches
mismatches = df[df['phoneme_length'] != df['viseme_length']]
# Print the sentences, phonemes, and visemes for those rows
for _, row in mismatches.head().iterrows():
print(f"Sentence: {row['expanded_sentence']}")
print(f"Phonemes: {' '.join(row['phonemes'])}")
print(f"Visemes: {' '.join(row['visemes'])}")
print(f"Phoneme Length: {row['phoneme_length']}")
print(f"Viseme Length: {row['viseme_length']}\n")
# Display a sample of sentences, phonemes, and visemes for comparison
sample_comparison = df[['sentence', 'phonemes', 'visemes']].sample(5)
for _, row in sample_comparison.iterrows():
print(f"Sentence: {row['sentence']}")
print(f"Phonemes: {' '.join(row['phonemes'])}")
print(f"Visemes: {' '.join(row['visemes'])}\n")
Length consistency: True Sentence: we made some big bales as well Phonemes: <sos> w iy <space> m ey d <space> s ah m <space> b ih g <space> b ey l z <space> ae z <space> w eh l <eos> Visemes: <sos> w iy <space> p ey t <space> t ah p <space> p iy k <space> p ey w t <space> ey t <space> w ey w <eos> Sentence: they had places in london all through their lives too Phonemes: <sos> dh ey <space> hh ae d <space> p l ey s ah z <space> ih n <space> l ah n d ah n <space> ao l <space> th r uw <space> dh eh r <space> l ih v z <space> t uw <eos> Visemes: <sos> t ey <space> w ey t <space> p w ey t ah t <space> iy k <space> w ah k t ah k <space> ao w <space> t w uh <space> t ey w <space> w iy f t <space> t uh <eos> Sentence: the metropolitan cathedral Phonemes: <sos> dh ah <space> m eh t r ah p aa l ah t ah n <space> k ah th iy d r ah l <eos> Visemes: <sos> t ah <space> p ey t w ah p aa w ah t ah k <space> k ah t iy t w ah w <eos> Sentence: it gave that hint of sexuality Phonemes: <sos> ih t <space> g ey v <space> dh ae t <space> hh ih n t <space> ah v <space> s eh k sh uw ae l ah t iy <eos> Visemes: <sos> iy t <space> k ey f <space> t ey t <space> w iy k t <space> ah f <space> t ey k ch uh ey w ah t iy <eos> Sentence: we have our final two to play Phonemes: <sos> w iy <space> hh ae v <space> aw er <space> f ay n ah l <space> t uw <space> t uw <space> p l ey <eos> Visemes: <sos> w iy <space> w ey f <space> aa er <space> f aa k ah w <space> t uh <space> t uh <space> p w ey <eos>
In [ ]:
# Display Sample Comparisons
sample_df = df.sample(5)
for index, row in sample_df.iterrows():
print(f"Sentence {index + 1}: {row['sentence']}")
print(f"Phonemes: {' '.join(row['phonemes'])}")
print(f"Visemes: {' '.join(row['visemes'])}\n")
Sentence 14447: and it goes really high and every night Phonemes: <sos> ah n d <space> ih t <space> g ow z <space> r ih l iy <space> hh ay <space> ah n d <space> eh v er iy <space> n ay t <eos> Visemes: <sos> ah k t <space> iy t <space> k ao t <space> w iy w iy <space> w aa <space> ah k t <space> ey f er iy <space> k aa t <eos> Sentence 41803: she was very sensitive to the fact that monarchs could be replaced by this method Phonemes: <sos> sh iy <space> w aa z <space> v eh r iy <space> s eh n s ah t ih v <space> t uw <space> dh ah <space> f ae k t <space> dh ae t <space> m aa n aa r k s <space> k uh d <space> b iy <space> r iy p l ey s t <space> b ay <space> dh ih s <space> m eh th ah d <eos> Visemes: <sos> ch iy <space> w aa t <space> f ey w iy <space> t ey k t ah t iy f <space> t uh <space> t ah <space> f ey k t <space> t ey t <space> p aa k aa w k t <space> k uh t <space> p iy <space> w iy p w ey t t <space> p aa <space> t iy t <space> p ey t ah t <eos> Sentence 35827: i do not belong to any club Phonemes: <sos> ay <space> d uw <space> n aa t <space> b ih l ao ng <space> t uw <space> eh n iy <space> k l ah b <eos> Visemes: <sos> aa <space> t uh <space> k aa t <space> p iy w ao k <space> t uh <space> ey k iy <space> k w ah p <eos> Sentence 45508: what can be done to help farmers like james Phonemes: <sos> w ah t <space> k ae n <space> b iy <space> d ah n <space> t uw <space> hh eh l p <space> f aa r m er z <space> l ay k <space> jh ey m z <eos> Visemes: <sos> w ah t <space> k ey k <space> p iy <space> t ah k <space> t uh <space> w ey w p <space> f aa w p er t <space> w aa k <space> ch ey p t <eos> Sentence 24063: and dirac did not like to speak in french Phonemes: <sos> ah n d <space> d ih r ah k <space> d ih d <space> n aa t <space> l ay k <space> t uw <space> s p iy k <space> ih n <space> f r eh n ch <eos> Visemes: <sos> ah k t <space> t iy w ah k <space> t iy t <space> k aa t <space> w aa k <space> t uh <space> t p iy k <space> iy k <space> f w ey k ch <eos>
In [ ]:
import os
# Store the original directory
original_directory = os.getcwd()
# Change to the directory where phonemes.txt is located
os.chdir('/content/drive/MyDrive/Dissertation/')
# Revert back to the original directory
os.chdir(original_directory)
In [ ]:
print(df.columns)
Index(['sentence', 'word_count', 'original_sentence', 'word_count_after_conversion', 'changed', 'phonemes', 'num_spaces', 'phonemes_str', 'sentence_with_tokens', 'phoneme_count', 'tokenized_sentence', 'visemes', 'phoneme_length', 'viseme_length'], dtype='object')
In [ ]:
df.head()
Out[ ]:
sentence | word_count | original_sentence | word_count_after_conversion | changed | phonemes | num_spaces | phonemes_str | sentence_with_tokens | phoneme_count | tokenized_sentence | visemes | phoneme_length | viseme_length | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | when you are cooking chips at home | 7 | WHEN YOU'RE COOKING CHIPS AT HOME | 6 | False | [<sos>, w, eh, n, <space>, y, uw, <space>, aa, r, <space>, k, uh, k, ih, ng, <space>, ch, ih, p, s, <space>, ae, t, <space>, hh, ow, m, <eos>] | 6 | <sos> W EH N <space> Y UW <space> AA R <space> K UH K IH NG <space> CH IH P S <space> AE T <space> HH OW M <eos> | <sos> when <space> you <space> are <space> cooking <space> chips <space> at <space> home <eos> | 29 | <sos> when <space> you <space> are <space> cooking <space> chips <space> at <space> home <eos> | [<sos>, w, ey, k, <space>, w, uh, <space>, aa, w, <space>, k, uh, k, iy, k, <space>, ch, iy, p, t, <space>, ey, t, <space>, w, ao, p, <eos>] | 29 | 29 |
1 | the traditional chip pan often stays on the shelf | 9 | THE TRADITIONAL CHIP PAN OFTEN STAYS ON THE SHELF | 9 | False | [<sos>, dh, ah, <space>, t, r, ah, d, ih, sh, ah, n, ah, l, <space>, ch, ih, p, <space>, p, ae, n, <space>, ao, f, ah, n, <space>, s, t, ey, z, <space>, aa, n, <space>, dh, ah, <space>, sh, eh, l, f, <eos>] | 8 | <sos> DH AH <space> T R AH D IH SH AH N AH L <space> CH IH P <space> P AE N <space> AO F AH N <space> S T EY Z <space> AA N <space> DH AH <space> SH EH L F <eos> | <sos> the <space> traditional <space> chip <space> pan <space> often <space> stays <space> on <space> the <space> shelf <eos> | 44 | <sos> the <space> traditional <space> chip <space> pan <space> often <space> stays <space> on <space> the <space> shelf <eos> | [<sos>, t, ah, <space>, t, w, ah, t, iy, ch, ah, k, ah, w, <space>, ch, iy, p, <space>, p, ey, k, <space>, ao, f, ah, k, <space>, t, t, ey, t, <space>, aa, k, <space>, t, ah, <space>, ch, ey, w, f, <eos>] | 44 | 44 |
2 | through what they call a knife block | 7 | THROUGH WHAT THEY CALL A KNIFE BLOCK | 7 | False | [<sos>, th, r, uw, <space>, w, ah, t, <space>, dh, ey, <space>, k, ao, l, <space>, ah, <space>, n, ay, f, <space>, b, l, aa, k, <eos>] | 6 | <sos> TH R UW <space> W AH T <space> DH EY <space> K AO L <space> AH <space> N AY F <space> B L AA K <eos> | <sos> through <space> what <space> they <space> call <space> a <space> knife <space> block <eos> | 27 | <sos> through <space> what <space> they <space> call <space> a <space> knife <space> block <eos> | [<sos>, t, w, uh, <space>, w, ah, t, <space>, t, ey, <space>, k, ao, w, <space>, ah, <space>, k, aa, f, <space>, p, w, aa, k, <eos>] | 27 | 27 |
3 | which involves firing a potato down a pipe | 8 | WHICH INVOLVES FIRING A POTATO DOWN A PIPE | 8 | False | [<sos>, w, ih, ch, <space>, ih, n, v, aa, l, v, z, <space>, f, ay, r, ih, ng, <space>, ah, <space>, p, ah, t, ey, t, ow, <space>, d, aw, n, <space>, ah, <space>, p, ay, p, <eos>] | 7 | <sos> W IH CH <space> IH N V AA L V Z <space> F AY R IH NG <space> AH <space> P AH T EY T OW <space> D AW N <space> AH <space> P AY P <eos> | <sos> which <space> involves <space> firing <space> a <space> potato <space> down <space> a <space> pipe <eos> | 38 | <sos> which <space> involves <space> firing <space> a <space> potato <space> down <space> a <space> pipe <eos> | [<sos>, w, iy, ch, <space>, iy, k, f, aa, w, f, t, <space>, f, aa, w, iy, k, <space>, ah, <space>, p, ah, t, ey, t, ao, <space>, t, aa, k, <space>, ah, <space>, p, aa, p, <eos>] | 38 | 38 |
4 | apart from the golden colour and the delicious flavour | 9 | APART FROM THE GOLDEN COLOUR AND THE DELICIOUS FLAVOUR | 9 | False | [<sos>, ah, p, aa, r, t, <space>, f, r, ah, m, <space>, dh, ah, <space>, g, ow, l, d, ah, n, <space>, k, ah, l, aw, r, <space>, ah, n, d, <space>, dh, ah, <space>, d, ih, l, ih, sh, ah, s, <space>, f, l, ae, v, er, <eos>] | 8 | <sos> AH P AA R T <space> F R AH M <space> DH AH <space> G OW L D AH N <space> K AH L AW R <space> AH N D <space> DH AH <space> D IH L IH SH AH S <space> F L AE V ER <eos> | <sos> apart <space> from <space> the <space> golden <space> colour <space> and <space> the <space> delicious <space> flavour <eos> | 49 | <sos> apart <space> from <space> the <space> golden <space> colour <space> and <space> the <space> delicious <space> flavour <eos> | [<sos>, ah, p, aa, w, t, <space>, f, w, ah, p, <space>, t, ah, <space>, k, ao, w, t, ah, k, <space>, k, ah, w, aa, w, <space>, ah, k, t, <space>, t, ah, <space>, t, iy, w, iy, ch, ah, t, <space>, f, w, ey, f, er, <eos>] | 49 | 49 |
In [ ]:
df.shape
Out[ ]:
(45700, 14)
In [ ]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
# Prepare the viseme data
viseme_tokenizer = Tokenizer(filters='', lower=False, split=' ')
viseme_tokenizer.fit_on_texts(df['visemes'])
viseme_sequences = viseme_tokenizer.texts_to_sequences(df['visemes'])
viseme_MAX_LEN = max(len(seq) for seq in viseme_sequences)
X_data = pad_sequences(viseme_sequences, maxlen=viseme_MAX_LEN, padding='post')
# Prepare the sentence data
sentence_tokenizer = Tokenizer(filters='!"#$%&()*+,-./:;=?@[\\]^_`{|}~\t\n')
sentence_tokenizer.fit_on_texts(df['sentence_with_tokens'])
sentence_sequences = sentence_tokenizer.texts_to_sequences(df['sentence_with_tokens'])
sentence_MAX_LEN = max(len(seq) for seq in sentence_sequences)
y_data = pad_sequences(sentence_sequences, maxlen=sentence_MAX_LEN, padding='post')
print("X_data:\n", X_data[:5])
print("\ny_data:\n", y_data[:5])
# Check if the special tokens <sos>, <space>, and <eos> are included in the tokenized sequences
special_tokens = ['<sos>', '<space>', '<eos>']
for token in special_tokens:
token_index = viseme_tokenizer.word_index[token]
token_in_X_data = any(token_index in seq for seq in X_data)
token_in_y_data = any(token_index in seq for seq in y_data)
print(f"\nIs '{token}' included in X_data? {token_in_X_data}")
print(f"Is '{token}' included in y_data? {token_in_y_data}")
X_data: [[10 3 7 4 1 3 14 1 9 3 1 4 14 4 5 4 1 16 5 8 2 1 7 2 1 3 13 8 11 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0] [10 2 6 1 2 3 6 2 5 16 6 4 6 3 1 16 5 8 1 8 7 4 1 13 12 6 4 1 2 2 7 2 1 9 4 1 2 6 1 16 7 3 12 11 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0] [10 2 3 14 1 3 6 2 1 2 7 1 4 13 3 1 6 1 4 9 12 1 8 3 9 4 11 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0] [10 3 5 16 1 5 4 12 9 3 12 2 1 12 9 3 5 4 1 6 1 8 6 2 7 2 13 1 2 9 4 1 6 1 8 9 8 11 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0] [10 6 8 9 3 2 1 12 3 6 8 1 2 6 1 4 13 3 2 6 4 1 4 6 3 9 3 1 6 4 2 1 2 6 1 2 5 3 5 16 6 2 1 12 3 7 12 15 11 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]] y_data: [[ 2 49 1 13 1 20 1 997 1 1629 1 38 1 145 3 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0] [ 2 4 1 1032 1 3014 1 4422 1 356 1 3334 1 22 1 4 1 4423 3 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0] [ 2 160 1 28 1 21 1 313 1 8 1 3015 1 2148 3 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0] [ 2 64 1 3817 1 3335 1 8 1 3818 1 115 1 8 1 4424 3 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0] [ 2 509 1 42 1 4 1 2036 1 1101 1 6 1 4 1 2149 1 1809 3 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]] Is '<sos>' included in X_data? True Is '<sos>' included in y_data? True Is '<space>' included in X_data? True Is '<space>' included in y_data? True Is '<eos>' included in X_data? True Is '<eos>' included in y_data? True
In [ ]:
# Seaborn Plot
sns.set_style("whitegrid")
sns.set_context("talk")
palette = ["#3498db", "#e74c3c"] # Blue and Red palette
# Boxplot Visualization
plt.figure(figsize=(16, 7))
# Boxplot for X_data (Viseme)
plt.subplot(1, 2, 1)
sns.boxplot(x=X_data.ravel(), color=palette[0])
plt.title('Boxplot of Encoded Values for Visemes', fontweight='bold')
plt.xlabel('Encoded Value', fontsize=14)
plt.xticks(fontsize=12)
plt.yticks(fontsize=12)
# Boxplot for y_data (Sentences)
plt.subplot(1, 2, 2)
sns.boxplot(x=y_data.ravel(), color=palette[1])
plt.title('Boxplot of Encoded Values for Sentences', fontweight='bold')
plt.xlabel('Encoded Value', fontsize=14)
plt.xticks(fontsize=12)
plt.yticks(fontsize=12)
plt.tight_layout()
plt.show()
In [ ]:
print(df.columns)
Index(['sentence', 'word_count', 'original_sentence', 'word_count_after_conversion', 'changed', 'phonemes', 'num_spaces', 'phonemes_str', 'sentence_with_tokens', 'phoneme_count', 'tokenized_sentence', 'visemes', 'phoneme_length', 'viseme_length'], dtype='object')
In [ ]:
df.head()
Out[ ]:
sentence | word_count | original_sentence | word_count_after_conversion | changed | phonemes | num_spaces | phonemes_str | sentence_with_tokens | phoneme_count | tokenized_sentence | visemes | phoneme_length | viseme_length | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | when you are cooking chips at home | 7 | WHEN YOU'RE COOKING CHIPS AT HOME | 6 | False | [<sos>, w, eh, n, <space>, y, uw, <space>, aa, r, <space>, k, uh, k, ih, ng, <space>, ch, ih, p, s, <space>, ae, t, <space>, hh, ow, m, <eos>] | 6 | <sos> W EH N <space> Y UW <space> AA R <space> K UH K IH NG <space> CH IH P S <space> AE T <space> HH OW M <eos> | <sos> when <space> you <space> are <space> cooking <space> chips <space> at <space> home <eos> | 29 | <sos> when <space> you <space> are <space> cooking <space> chips <space> at <space> home <eos> | [<sos>, w, ey, k, <space>, w, uh, <space>, aa, w, <space>, k, uh, k, iy, k, <space>, ch, iy, p, t, <space>, ey, t, <space>, w, ao, p, <eos>] | 29 | 29 |
1 | the traditional chip pan often stays on the shelf | 9 | THE TRADITIONAL CHIP PAN OFTEN STAYS ON THE SHELF | 9 | False | [<sos>, dh, ah, <space>, t, r, ah, d, ih, sh, ah, n, ah, l, <space>, ch, ih, p, <space>, p, ae, n, <space>, ao, f, ah, n, <space>, s, t, ey, z, <space>, aa, n, <space>, dh, ah, <space>, sh, eh, l, f, <eos>] | 8 | <sos> DH AH <space> T R AH D IH SH AH N AH L <space> CH IH P <space> P AE N <space> AO F AH N <space> S T EY Z <space> AA N <space> DH AH <space> SH EH L F <eos> | <sos> the <space> traditional <space> chip <space> pan <space> often <space> stays <space> on <space> the <space> shelf <eos> | 44 | <sos> the <space> traditional <space> chip <space> pan <space> often <space> stays <space> on <space> the <space> shelf <eos> | [<sos>, t, ah, <space>, t, w, ah, t, iy, ch, ah, k, ah, w, <space>, ch, iy, p, <space>, p, ey, k, <space>, ao, f, ah, k, <space>, t, t, ey, t, <space>, aa, k, <space>, t, ah, <space>, ch, ey, w, f, <eos>] | 44 | 44 |
2 | through what they call a knife block | 7 | THROUGH WHAT THEY CALL A KNIFE BLOCK | 7 | False | [<sos>, th, r, uw, <space>, w, ah, t, <space>, dh, ey, <space>, k, ao, l, <space>, ah, <space>, n, ay, f, <space>, b, l, aa, k, <eos>] | 6 | <sos> TH R UW <space> W AH T <space> DH EY <space> K AO L <space> AH <space> N AY F <space> B L AA K <eos> | <sos> through <space> what <space> they <space> call <space> a <space> knife <space> block <eos> | 27 | <sos> through <space> what <space> they <space> call <space> a <space> knife <space> block <eos> | [<sos>, t, w, uh, <space>, w, ah, t, <space>, t, ey, <space>, k, ao, w, <space>, ah, <space>, k, aa, f, <space>, p, w, aa, k, <eos>] | 27 | 27 |
3 | which involves firing a potato down a pipe | 8 | WHICH INVOLVES FIRING A POTATO DOWN A PIPE | 8 | False | [<sos>, w, ih, ch, <space>, ih, n, v, aa, l, v, z, <space>, f, ay, r, ih, ng, <space>, ah, <space>, p, ah, t, ey, t, ow, <space>, d, aw, n, <space>, ah, <space>, p, ay, p, <eos>] | 7 | <sos> W IH CH <space> IH N V AA L V Z <space> F AY R IH NG <space> AH <space> P AH T EY T OW <space> D AW N <space> AH <space> P AY P <eos> | <sos> which <space> involves <space> firing <space> a <space> potato <space> down <space> a <space> pipe <eos> | 38 | <sos> which <space> involves <space> firing <space> a <space> potato <space> down <space> a <space> pipe <eos> | [<sos>, w, iy, ch, <space>, iy, k, f, aa, w, f, t, <space>, f, aa, w, iy, k, <space>, ah, <space>, p, ah, t, ey, t, ao, <space>, t, aa, k, <space>, ah, <space>, p, aa, p, <eos>] | 38 | 38 |
4 | apart from the golden colour and the delicious flavour | 9 | APART FROM THE GOLDEN COLOUR AND THE DELICIOUS FLAVOUR | 9 | False | [<sos>, ah, p, aa, r, t, <space>, f, r, ah, m, <space>, dh, ah, <space>, g, ow, l, d, ah, n, <space>, k, ah, l, aw, r, <space>, ah, n, d, <space>, dh, ah, <space>, d, ih, l, ih, sh, ah, s, <space>, f, l, ae, v, er, <eos>] | 8 | <sos> AH P AA R T <space> F R AH M <space> DH AH <space> G OW L D AH N <space> K AH L AW R <space> AH N D <space> DH AH <space> D IH L IH SH AH S <space> F L AE V ER <eos> | <sos> apart <space> from <space> the <space> golden <space> colour <space> and <space> the <space> delicious <space> flavour <eos> | 49 | <sos> apart <space> from <space> the <space> golden <space> colour <space> and <space> the <space> delicious <space> flavour <eos> | [<sos>, ah, p, aa, w, t, <space>, f, w, ah, p, <space>, t, ah, <space>, k, ao, w, t, ah, k, <space>, k, ah, w, aa, w, <space>, ah, k, t, <space>, t, ah, <space>, t, iy, w, iy, ch, ah, t, <space>, f, w, ey, f, er, <eos>] | 49 | 49 |
In [ ]:
# Check the structure of the df['visemes'] column
print("First 5 entries in 'visemes' column:")
print(df['visemes'].head())
# Check the structure of the df['tokenized_sentence'] column
print("\nFirst 5 entries in 'tokenized_sentence' column:")
print(df['sentence_with_tokens'].head())
# Check if the special tokens <sos>, <space>, and <eos> are already included in the data
special_tokens = ['<sos>', '<space>', '<eos>']
for token in special_tokens:
token_in_visemes = df['visemes'].apply(lambda x: token in x).any()
token_in_tokenized_sentence = df['sentence_with_tokens'].apply(lambda x: token in x).any()
print(f"\nIs '{token}' included in 'visemes' column? {token_in_visemes}")
print(f"Is '{token}' included in 'tokenized_sentence' column? {token_in_tokenized_sentence}")
First 5 entries in 'visemes' column: 0 [<sos>, w, ey, k, <space>, w, uh, <space>, aa, w, <space>, k, uh, k, iy, k, <space>, ch, iy, p, t, <space>, ey, t, <space>, w, ao, p, <eos>] 1 [<sos>, t, ah, <space>, t, w, ah, t, iy, ch, ah, k, ah, w, <space>, ch, iy, p, <space>, p, ey, k, <space>, ao, f, ah, k, <space>, t, t, ey, t, <space>, aa, k, <space>, t, ah, <space>, ch, ey, w, f, <eos>] 2 [<sos>, t, w, uh, <space>, w, ah, t, <space>, t, ey, <space>, k, ao, w, <space>, ah, <space>, k, aa, f, <space>, p, w, aa, k, <eos>] 3 [<sos>, w, iy, ch, <space>, iy, k, f, aa, w, f, t, <space>, f, aa, w, iy, k, <space>, ah, <space>, p, ah, t, ey, t, ao, <space>, t, aa, k, <space>, ah, <space>, p, aa, p, <eos>] 4 [<sos>, ah, p, aa, w, t, <space>, f, w, ah, p, <space>, t, ah, <space>, k, ao, w, t, ah, k, <space>, k, ah, w, aa, w, <space>, ah, k, t, <space>, t, ah, <space>, t, iy, w, iy, ch, ah, t, <space>, f, w, ey, f, er, <eos>] Name: visemes, dtype: object First 5 entries in 'tokenized_sentence' column: 0 <sos> when <space> you <space> are <space> cooking <space> chips <space> at <space> home <eos> 1 <sos> the <space> traditional <space> chip <space> pan <space> often <space> stays <space> on <space> the <space> shelf <eos> 2 <sos> through <space> what <space> they <space> call <space> a <space> knife <space> block <eos> 3 <sos> which <space> involves <space> firing <space> a <space> potato <space> down <space> a <space> pipe <eos> 4 <sos> apart <space> from <space> the <space> golden <space> colour <space> and <space> the <space> delicious <space> flavour <eos> Name: sentence_with_tokens, dtype: object Is '<sos>' included in 'visemes' column? True Is '<sos>' included in 'tokenized_sentence' column? True Is '<space>' included in 'visemes' column? True Is '<space>' included in 'tokenized_sentence' column? True Is '<eos>' included in 'visemes' column? True Is '<eos>' included in 'tokenized_sentence' column? True
In [ ]:
# Prepare the viseme and sentence data
viseme_tokenizer = Tokenizer(filters='', lower=False, split=' ')
viseme_tokenizer.fit_on_texts(df['visemes'])
viseme_sequences = viseme_tokenizer.texts_to_sequences(df['visemes'])
viseme_MAX_LEN = max(len(seq) for seq in viseme_sequences)
X_data = pad_sequences(viseme_sequences, maxlen=viseme_MAX_LEN, padding='post')
sentence_tokenizer = Tokenizer(filters='!"#$%&()*+,-./:;=?@[\\]^_`{|}~\t\n')
sentence_tokenizer.fit_on_texts(df['sentence_with_tokens'])
sentence_sequences = sentence_tokenizer.texts_to_sequences(df['sentence_with_tokens'])
sentence_MAX_LEN = max(len(seq) for seq in sentence_sequences)
y_data = pad_sequences(sentence_sequences, maxlen=sentence_MAX_LEN, padding='post')
In [ ]:
# Calculate lengths for each sequence in viseme and sentence sequences
viseme_lengths = [len(seq) for seq in viseme_sequences]
sentence_lengths = [len(seq) for seq in sentence_sequences]
# Descriptive Statistics
print("=== Viseme Sequences ===")
print(f"Average Length: {np.mean(viseme_lengths)}")
print(f"Minimum Length: {np.min(viseme_lengths)}")
print(f"Maximum Length: {np.max(viseme_lengths)}")
print("\n")
print("=== Sentence Sequences ===")
print(f"Average Length: {np.mean(sentence_lengths)}")
print(f"Minimum Length: {np.min(sentence_lengths)}")
print(f"Maximum Length: {np.max(sentence_lengths)}")
print("\n")
# Token Frequency
viseme_freq = pd.Series([item for sublist in viseme_sequences for item in sublist]).value_counts()
sentence_freq = pd.Series([item for sublist in sentence_sequences for item in sublist]).value_counts()
print("=== Most Frequent Visemes ===")
print(viseme_freq.head(10))
print("\n")
print("=== Most Frequent Words ===")
print(sentence_freq.head(10))
print("\n")
# Special Tokens
for token in special_tokens:
viseme_token_count = sum([seq.count(viseme_tokenizer.word_index[token]) for seq in viseme_sequences])
sentence_token_count = sum([seq.count(sentence_tokenizer.word_index[token]) for seq in sentence_sequences])
print(f"Occurrences of '{token}' in viseme sequences: {viseme_token_count}")
print(f"Occurrences of '{token}' in sentence sequences: {sentence_token_count}")
print("\n")
=== Viseme Sequences === Average Length: 34.041969365426695 Minimum Length: 11 Maximum Length: 109 === Sentence Sequences === Average Length: 15.970196936542669 Minimum Length: 7 Maximum Length: 53 === Most Frequent Visemes === 1 297858 2 265845 ... 9 60570 10 45700 Length: 10, dtype: int64 === Most Frequent Words === 1 296369 2 45700 ... 9 7996 10 7692 Length: 10, dtype: int64 Occurrences of '<sos>' in viseme sequences: 45700 Occurrences of '<sos>' in sentence sequences: 45700 Occurrences of '<space>' in viseme sequences: 297858 Occurrences of '<space>' in sentence sequences: 296369 Occurrences of '<eos>' in viseme sequences: 45700 Occurrences of '<eos>' in sentence sequences: 45700
In [ ]:
# Split data into train and test sets
X_train, X_test, y_train, y_test, train_indices, test_indices = train_test_split(
X_data, y_data, range(len(X_data)), test_size=0.2, random_state=42
)
# Create TensorFlow Dataset objects
batch_size = 64
train_dataset = tf.data.Dataset.from_tensor_slices(((X_train, y_train), y_train)).batch(batch_size).prefetch(buffer_size=tf.data.AUTOTUNE)
test_dataset = tf.data.Dataset.from_tensor_slices(((X_test, y_test), y_test)).batch(batch_size).prefetch(buffer_size=tf.data.AUTOTUNE)
# Define the model
embedding_dim = 128
units = 256
# Encoder
encoder_inputs = Input(shape=(viseme_MAX_LEN,))
encoder_embedding_layer = Embedding(input_dim=len(viseme_tokenizer.word_index) + 1, output_dim=embedding_dim)
encoder_embedding = encoder_embedding_layer(encoder_inputs)
encoder_gru = GRU(units, return_sequences=True, return_state=True)
encoder_outputs, encoder_state = encoder_gru(encoder_embedding)
# Decoder
decoder_inputs = Input(shape=(sentence_MAX_LEN,))
decoder_embedding_layer = Embedding(input_dim=len(sentence_tokenizer.word_index) + 1, output_dim=embedding_dim)
decoder_embedding = decoder_embedding_layer(decoder_inputs)
decoder_gru = GRU(units, return_sequences=True)
decoder_outputs = decoder_gru(decoder_embedding, initial_state=encoder_state)
# Attention
attention = Attention()
context_vector = attention([decoder_outputs, encoder_outputs])
# Concatenate context vector and decoder output
decoder_combined = tf.concat([context_vector, decoder_outputs], axis=-1)
# Dense layer
decoder_dense = Dense(len(sentence_tokenizer.word_index) + 1, activation='softmax')
decoder_outputs = decoder_dense(decoder_combined)
# Model
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)
model.compile(optimizer=Adam(), loss='sparse_categorical_crossentropy', metrics=['accuracy'])
# Define early stopping callback
early_stopping_callback = EarlyStopping(monitor='val_loss', patience=5)
# Train the model
history = model.fit(
[X_train, y_train],
y_train,
batch_size=batch_size,
epochs=5,
validation_data=([X_test, y_test], y_test),
callbacks=[early_stopping_callback]
)
# Evaluate the model on the test set
test_loss, test_acc = model.evaluate([X_test, y_test], y_test)
print("Test Loss:", test_loss)
print("Test Accuracy:", test_acc)
Epoch 1/5 572/572 [==============================] - 376s 651ms/step - loss: 1.1226 - accuracy: 0.8587 - val_loss: 0.5126 - val_accuracy: 0.9333 Epoch 2/5 572/572 [==============================] - 370s 647ms/step - loss: 0.3519 - accuracy: 0.9546 - val_loss: 0.2554 - val_accuracy: 0.9695 Epoch 3/5 572/572 [==============================] - 365s 638ms/step - loss: 0.1824 - accuracy: 0.9781 - val_loss: 0.1546 - val_accuracy: 0.9835 Epoch 4/5 572/572 [==============================] - 365s 637ms/step - loss: 0.1049 - accuracy: 0.9875 - val_loss: 0.1117 - val_accuracy: 0.9891 Epoch 5/5 572/572 [==============================] - 363s 635ms/step - loss: 0.0640 - accuracy: 0.9920 - val_loss: 0.0937 - val_accuracy: 0.9916 286/286 [==============================] - 49s 172ms/step - loss: 0.0937 - accuracy: 0.9916 Test Loss: 0.09373828768730164 Test Accuracy: 0.9915651679039001
In [ ]:
# 1. Advanced Training and Validation Loss Curve
plt.figure(figsize=(12, 6))
plt.plot(history.history['loss'], label='Training Loss', color='blue', linestyle='--')
plt.plot(history.history['val_loss'], label='Validation Loss', color='red')
plt.scatter(np.argmin(history.history['val_loss']), min(history.history['val_loss']), s=100, c='red', marker='o')
plt.title('Advanced Training and Validation Loss over Epochs')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend()
plt.grid(True, which='both', linestyle='--', linewidth=0.5)
plt.tight_layout()
plt.show()
# 2. Advanced Training and Validation Accuracy Curve
plt.figure(figsize=(12, 6))
plt.plot(history.history['accuracy'], label='Training Accuracy', color='blue', linestyle='--')
plt.plot(history.history['val_accuracy'], label='Validation Accuracy', color='purple')
plt.scatter(np.argmax(history.history['val_accuracy']), max(history.history['val_accuracy']), s=100, c='purple', marker='o')
plt.title('Advanced Training and Validation Accuracy over Epochs')
plt.xlabel('Epochs')
plt.ylabel('Accuracy')
plt.legend()
plt.grid(True, which='both', linestyle='--', linewidth=0.5)
plt.tight_layout()
plt.show()
# 3. Model Architecture Visualization
plot_model(model, to_file='advanced_model_plot.png', show_shapes=True, show_layer_names=True, expand_nested=True)
plt.figure(figsize=(20, 20))
img = plt.imread('advanced_model_plot.png')
plt.imshow(img)
plt.axis('off')
plt.title('Advanced Model Architecture Visualization')
plt.show()
# 4. Advanced Final Test Loss and Accuracy
plt.figure(figsize=(10, 6))
bar_width = 0.35
index = np.arange(2)
bars1 = [test_loss, history.history['val_loss'][-1]]
bars2 = [test_acc, history.history['val_accuracy'][-1]]
rects1 = plt.bar(index, bars1, bar_width, label='Test', color='blue', alpha=0.8)
rects2 = plt.bar(index + bar_width, bars2, bar_width, label='Validation (Final Epoch)', color='green', alpha=0.8)
plt.xlabel('Metrics')
plt.ylabel('Values')
plt.title('Test Loss and Accuracy vs. Validation Metrics')
plt.xticks(index + bar_width / 2, ('Loss', 'Accuracy'))
plt.legend()
plt.tight_layout()
plt.show()
In [ ]:
print(test_indices[:10])
print([df['visemes'].iloc[idx] for idx in test_indices[:10]])
[29468, 44953, 23062, 43578, 2182, 26391, 21035, 35684, 39308, 1911] [['<sos>', 'w', 'iy', '<space>', 'k', 'iy', 't', '<space>', 'w', 'aa', 'w', 't', '<space>', 'f', 'ey', 'k', 't', 't', '<eos>'], ['<sos>', 't', 'ey', 't', '<space>', 'p', 'iy', 'k', 't', '<space>', 'f', 'ao', 'w', '<space>', 't', 'ah', '<space>', 'f', 'er', 't', 't', '<space>', 't', 'aa', 'p', '<space>', 't', 'iy', 'k', 't', '<space>', 'w', 'iy', '<space>', 'ch', 'ao', 'k', 't', '<space>', 't', 'ah', '<space>', 'w', 'uh', '<eos>'], ['<sos>', 'aa', '<space>', 't', 'iy', 'k', 'k', '<space>', 't', 'iy', 't', '<space>', 'iy', 't', '<space>', 'ah', '<space>', 'w', 'iy', 'w', 'iy', '<space>', 'k', 'uh', 't', '<space>', 'w', 'iy', 't', 'ah', 'w', '<space>', 'f', 'w', 'ey', 't', '<eos>'], ['<sos>', 'p', 'ey', 't', 't', '<space>', 'aa', 'k', '<space>', 't', 'ah', '<space>', 'f', 'ey', 'k', 't', '<space>', 'aa', '<space>', 'w', 'ey', 'f', '<space>', 'w', 'er', 'k', 't', '<space>', 'w', 'iy', 't', '<space>', 'w', 'w', 'uh', '<space>', 'f', 'ao', 'w', '<space>', 'w', 'ao', 'k', 'k', 'er', '<eos>'], ['<sos>', 't', 'ah', '<space>', 'ah', 't', 'er', 't', '<space>', 'ao', 'k', 'w', 'iy', '<space>', 'k', 'ey', 't', '<space>', 'k', 'ey', 'p', 'ah', 'k', '<eos>'], ['<sos>', 'iy', 't', '<space>', 'iy', 't', '<space>', 'k', 'aa', 't', '<space>', 'ao', 'f', 'ah', 'k', '<space>', 'aa', '<space>', 'w', 'uh', 't', '<space>', 'p', 'aa', '<space>', 't', 'ey', 'p', 'p', 'er', '<eos>'], ['<sos>', 't', 'iy', 't', '<space>', 'w', 'aa', 't', '<space>', 't', 'ah', '<space>', 't', 'aa', 'p', '<space>', 'iy', 'k', '<space>', 'w', 'iy', 'ch', '<eos>'], ['<sos>', 't', 'ah', '<space>', 'ch', 'ao', '<space>', 'w', 'ey', 'w', '<space>', 'ah', '<space>', 't', 'iy', 'p', '<space>', 'ah', 'f', '<space>', 'f', 'aa', 'f', '<space>', 'k', 'w', 'iy', 't', '<space>', 'ch', 'ey', 'w', 'ah', 'k', 'ch', 'er', 't', '<space>', 'p', 'iy', 't', '<space>', 't', 'ey', 'w', '<space>', 'w', 'iy', 't', 't', '<space>', 'ah', 'k', 'ey', 'k', 't', 't', '<space>', 'p', 'aa', 't', 'ah', 'p', 'w', 'iy', '<space>', 't', 'ah', '<space>', 'k', 'w', 'ey', 't', 'ah', 't', 't', '<space>', 'k', 'w', 'iy', 't', '<eos>'], ['<sos>', 'ah', 'k', 't', '<space>', 't', 'ao', '<space>', 'w', 'iy', '<space>', 'w', 'ey', 'f', '<space>', 't', 'w', 'aa', 't', '<space>', 't', 'uh', '<space>', 'p', 'iy', '<space>', 'k', 'uh', 't', '<space>', 'p', 'ey', 'k', 'er', 't', '<space>', 'p', 'aa', '<space>', 't', 'ey', 'iy', 'k', '<space>', 't', 'ey', 'k', '<eos>'], ['<sos>', 'w', 'aa', 'w', '<space>', 'ey', 'k', 'iy', '<space>', 'ey', 't', '<space>', 't', 't', 'iy', 'w', '<space>', 'ah', '<space>', 'p', 'ey', 'p', '<eos>']]
In [ ]:
# Function to convert predicted token IDs to text
def sequences_to_texts(sequences, tokenizer):
texts = tokenizer.sequences_to_texts(sequences)
return texts
# Select a subset from the test data for evaluation
num_examples = 15
X_test_subset = X_test[:num_examples]
y_test_subset = y_test[:num_examples]
original_visemes = df['visemes'].tolist()
# Get the visemes for the selected test subset
test_visemes_subset = [original_visemes[idx] for idx in test_indices[:num_examples]]
# Generate predictions on the subset of the test set
predictions = model.predict([X_test_subset, y_test_subset])
# Convert predicted token IDs to text
predicted_sentences = sequences_to_texts(predictions.argmax(axis=-1), sentence_tokenizer)
# Convert original token IDs to text
original_sentences = sequences_to_texts(y_test_subset, sentence_tokenizer)
# Initialize WER, BLEU, and CER scores
wer_scores = []
bleu_scores = []
# Print the original sentences, predicted sentences, and visemes side by side
for original, predicted, viseme_seq in zip(original_sentences, predicted_sentences, test_visemes_subset):
viseme_seq_text = ' '.join(viseme_seq)
print(f"Original: {original}\nPredicted: {predicted}\nVisemes: {viseme_seq_text}")
# Calculate WER
wer = jiwer.wer(original, predicted)
wer_scores.append(wer)
print(f"WER: {wer:.4f}")
# Tokenize sentences for BLEU score calculation
original_tokens = original.split()
predicted_tokens = predicted.split()
# Calculate BLEU score
bleu_score = corpus_bleu([[original_tokens]], [predicted_tokens], smoothing_function=SmoothingFunction().method3)
bleu_scores.append(bleu_score)
print(f"BLEU Score: {bleu_score:.4f}")
print("-" * 50)
# Calculate average WER and BLEU scores
average_wer = sum(wer_scores) / len(wer_scores)
average_bleu = sum(bleu_scores) / len(bleu_scores)
print(f"Average WER: {average_wer:.4f}")
print(f"Average BLEU Score: {average_bleu:.4f}")
1/1 [==============================] - 1s 812ms/step Original: <sos> we <space> need <space> hard <space> facts <eos> Predicted: <sos> we <space> need <space> hard <space> facts <eos> Visemes: <sos> w iy <space> k iy t <space> w aa w t <space> f ey k t t <eos> WER: 0.0000 BLEU Score: 1.0000 -------------------------------------------------- Original: <sos> that <space> means <space> for <space> the <space> first <space> time <space> since <space> we <space> joined <space> the <space> eu <eos> Predicted: <sos> that <space> means <space> for <space> the <space> first <space> time <space> since <space> we <space> joined <space> the <space> eu <eos> Visemes: <sos> t ey t <space> p iy k t <space> f ao w <space> t ah <space> f er t t <space> t aa p <space> t iy k t <space> w iy <space> ch ao k t <space> t ah <space> w uh <eos> WER: 0.0000 BLEU Score: 1.0000 -------------------------------------------------- Original: <sos> i <space> think <space> this <space> is <space> a <space> really <space> good <space> little <space> flat <eos> Predicted: <sos> i <space> think <space> this <space> is <space> a <space> really <space> good <space> little <space> flat <eos> Visemes: <sos> aa <space> t iy k k <space> t iy t <space> iy t <space> ah <space> w iy w iy <space> k uh t <space> w iy t ah w <space> f w ey t <eos> WER: 0.0000 BLEU Score: 1.0000 -------------------------------------------------- Original: <sos> based <space> on <space> the <space> fact <space> i <space> have <space> worked <space> with <space> hugh <space> for <space> longer <eos> Predicted: <sos> based <space> on <space> the <space> fact <space> i <space> have <space> worked <space> with <space> launch <space> for <space> longer <eos> Visemes: <sos> p ey t t <space> aa k <space> t ah <space> f ey k t <space> aa <space> w ey f <space> w er k t <space> w iy t <space> w w uh <space> f ao w <space> w ao k k er <eos> WER: 0.0435 BLEU Score: 0.8787 -------------------------------------------------- Original: <sos> the <space> others <space> only <space> get <space> gammon <eos> Predicted: <sos> the <space> others <space> only <space> get <space> tim <eos> Visemes: <sos> t ah <space> ah t er t <space> ao k w iy <space> k ey t <space> k ey p ah k <eos> WER: 0.0909 BLEU Score: 0.8071 -------------------------------------------------- Original: <sos> it <space> is <space> not <space> often <space> i <space> lose <space> my <space> temper <eos> Predicted: <sos> it <space> is <space> not <space> often <space> i <space> lose <space> my <space> nickname <eos> Visemes: <sos> iy t <space> iy t <space> k aa t <space> ao f ah k <space> aa <space> w uh t <space> p aa <space> t ey p p er <eos> WER: 0.0588 BLEU Score: 0.8844 -------------------------------------------------- Original: <sos> this <space> was <space> the <space> time <space> in <space> which <eos> Predicted: <sos> this <space> was <space> the <space> time <space> in <space> which <eos> Visemes: <sos> t iy t <space> w aa t <space> t ah <space> t aa p <space> iy k <space> w iy ch <eos> WER: 0.0000 BLEU Score: 1.0000 -------------------------------------------------- Original: <sos> the <space> show <space> where <space> a <space> team <space> of <space> five <space> quiz <space> challengers <space> pit <space> their <space> wits <space> against <space> possibly <space> the <space> greatest <space> quiz <eos> Predicted: <sos> the <space> show <space> where <space> a <space> team <space> of <space> five <space> quiz <space> challengers <space> pit <space> their <space> wits <space> against <space> possibly <space> the <space> greatest <space> quiz <eos> Visemes: <sos> t ah <space> ch ao <space> w ey w <space> ah <space> t iy p <space> ah f <space> f aa f <space> k w iy t <space> ch ey w ah k ch er t <space> p iy t <space> t ey w <space> w iy t t <space> ah k ey k t t <space> p aa t ah p w iy <space> t ah <space> k w ey t ah t t <space> k w iy t <eos> WER: 0.0000 BLEU Score: 1.0000 -------------------------------------------------- Original: <sos> and <space> so <space> we <space> have <space> tried <space> to <space> be <space> good <space> mannered <space> by <space> saying <space> ten <eos> Predicted: <sos> and <space> so <space> we <space> have <space> tried <space> to <space> be <space> good <space> topical <space> by <space> saying <space> ten <eos> Visemes: <sos> ah k t <space> t ao <space> w iy <space> w ey f <space> t w aa t <space> t uh <space> p iy <space> k uh t <space> p ey k er t <space> p aa <space> t ey iy k <space> t ey k <eos> WER: 0.0400 BLEU Score: 0.8895 -------------------------------------------------- Original: <sos> while <space> annie's <space> still <space> a <space> babe <eos> Predicted: <sos> while <space> tim <space> still <space> a <space> peake <eos> Visemes: <sos> w aa w <space> ey k iy <space> ey t <space> t t iy w <space> ah <space> p ey p <eos> WER: 0.1818 BLEU Score: 0.4833 -------------------------------------------------- Original: <sos> which <space> was <space> the <space> area <space> that <space> they <space> wanted <space> us <space> to <space> take <space> the <space> casualty <eos> Predicted: <sos> which <space> was <space> the <space> area <space> that <space> they <space> wanted <space> us <space> to <space> take <space> the <space> depth <eos> Visemes: <sos> w iy ch <space> w aa t <space> t ah <space> ey w iy ah <space> t ey t <space> t ey <space> w aa k t ah t <space> ah t <space> t uh <space> t ey k <space> t ah <space> k ey ch ah w ah w t iy <eos> WER: 0.0400 BLEU Score: 0.9245 -------------------------------------------------- Original: <sos> who <space> do <space> i <space> see <space> about <space> a <space> death <space> certificate <eos> Predicted: <sos> who <space> do <space> i <space> see <space> about <space> a <space> death <space> certificate <eos> Visemes: <sos> w uh <space> t uh <space> aa <space> t iy <space> ah p aa t <space> ah <space> t ey t <space> t er t iy f iy k ah t <eos> WER: 0.0000 BLEU Score: 1.0000 -------------------------------------------------- Original: <sos> reported <space> back <space> to <space> his <space> cabinet <eos> Predicted: <sos> reported <space> back <space> to <space> his <space> cabinet <eos> Visemes: <sos> w iy p ao w t ah t <space> p ey k <space> t uh <space> w iy t <space> k ey p ah k ah t <eos> WER: 0.0000 BLEU Score: 1.0000 -------------------------------------------------- Original: <sos> you <space> want <space> to <space> say <space> to <space> people <space> round <space> the <space> dining <space> table <eos> Predicted: <sos> you <space> want <space> to <space> say <space> to <space> people <space> round <space> the <space> dining <space> table <eos> Visemes: <sos> w uh <space> w aa k t <space> t uh <space> t ey <space> t uh <space> p iy p ah w <space> w aa k t <space> t ah <space> t aa k iy k <space> t ey p ah w <eos> WER: 0.0000 BLEU Score: 1.0000 -------------------------------------------------- Original: <sos> sold <space> in <space> aid <space> of <space> children <space> in <space> need <eos> Predicted: <sos> sold <space> in <space> aid <space> of <space> children <space> in <space> need <eos> Visemes: <sos> t ao w t <space> iy k <space> ey t <space> ah f <space> ch iy w t w ah k <space> iy k <space> k iy t <eos> WER: 0.0000 BLEU Score: 1.0000 -------------------------------------------------- Average WER: 0.0303 Average BLEU Score: 0.9245
In [ ]:
import matplotlib.pyplot as plt
# Create a list of set labels
sets = [f"Set {i+1}" for i in range(num_examples)]
# Create a figure and a set of subplots
fig, ax = plt.subplots(2, 1, figsize=(12, 10))
# Plot WER scores with markers
ax[0].plot(sets, wer_scores, color='steelblue', marker='o', label='WER per Set')
ax[0].axhline(average_wer, color='coral', linestyle='dashed', linewidth=1, label=f'Average WER: {average_wer:.4f}')
ax[0].set_title('Word Error Rate (WER) for Each Set')
ax[0].set_ylabel('WER')
ax[0].set_xticks(sets)
ax[0].set_xticklabels(sets, rotation=45)
ax[0].legend()
# Plot BLEU scores with markers
ax[1].plot(sets, bleu_scores, color='steelblue', marker='o', label='BLEU Score per Set')
ax[1].axhline(average_bleu, color='coral', linestyle='dashed', linewidth=1, label=f'Average BLEU: {average_bleu:.4f}')
ax[1].set_title('BLEU Score for Each Set')
ax[1].set_ylabel('BLEU Score')
ax[1].set_xticks(sets)
ax[1].set_xticklabels(sets, rotation=45)
ax[1].legend()
# Adjust the layout
plt.tight_layout()
plt.show()
In [ ]: