# import the necessary libraries
from sklearn.model_selection import train_test_split
import nltk
nltk.download('punkt')
nltk.download('perluniprops')
import re
import pprint
from nltk import word_tokenize
from nltk.tokenize import RegexpTokenizer
from nltk.tokenize.moses import MosesDetokenizer
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import random
import math
# the text used as corpus for the project
file = "europarl-v7.ro-en.en"
text_list = []
# read the whole text line by line
with open(file, encoding="utf8") as file:
    for line in file:
        text_list.append(line)
# split the text into sentences (one per line) and randomly select 33% of them as the test set
train, test = train_test_split(text_list, test_size=0.33, random_state=42)
In the following code block, the text for the bigram and trigram models is constructed. The main step is the addition of start and end tokens ("*start1*", "*start2*", "*end12*") at the beginning and end of each sentence, respectively. Moreover, words appearing ten times or fewer are replaced with the token "*UNK*". Finally, the constructed texts are saved to the files "unigram.txt", "bigram.txt" and "trigram.txt". The whole process takes roughly six hours on an ordinary computer, so it is advised not to run the following code; the .txt files have already been created.
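As a quick illustration of the padding scheme (a minimal sketch on a made-up sentence, not part of the pipeline):
# hypothetical example sentence, purely to illustrate the padding applied below
line = "I would like *UNK* to resume"
print("*start1* " + line + " *end12*")           # text used for the bigram model
print("*start1* *start2* " + line + " *end12*")  # text used for the trigram model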
tokenizer = RegexpTokenizer(r'\w+')
# tokenize the text and calculate the frequency of each token
text = " ".join(train)
text = text.replace("\n", "*new_line*")
tokens = word_tokenize(text)
fdist = nltk.FreqDist(tokens)
# replace the least frequent tokens
# (i.e. tokens appearing 10 times or fewer) with *UNK*
tokens_less10 = [k for k, v in fdist.items() if v <= 10]
tokens = [i if i not in tokens_less10 else "*UNK*" for i in tokens]
# reconstruct the text, with the least frequent words replaced
detokenizer = MosesDetokenizer()
text = detokenizer.detokenize(tokens, return_str=True)
text = text.replace("*new_line*", "\n")
text_file = open("unigram.txt", "w", encoding="utf8")
text_file.write(text)
text_file.close()
# read again the reconstructed text
train = []
for line in text.split("\n"):
    train.append(line.strip())
# build the bigram text by adding the start1 token at the beginning of each
# sentence and end12 at the end of each sentence
# build the trigram text by adding the start1 start2 tokens at the beginning
# of each sentence and end12 at the end of each sentence
bigram_text = ""
trigram_text = ""
for line in train:
    if len(line) >= 2 and line[-2] == ".":
        bigram_text += "*start1* " + line[:-2] + " *end12*"
        trigram_text += "*start1* *start2* " + line[:-2] + " *end12*"
    else:
        bigram_text += "*start1* " + line + " *end12*"
        trigram_text += "*start1* *start2* " + line + " *end12*"
# save the results in text files
text_file = open("bigram.txt", "w", encoding="utf8")
text_file.write(bigram_text)
text_file.close()
text_file = open("trigram.txt", "w", encoding="utf8")
text_file.write(trigram_text)
text_file.close()
# tokenize the bigrams
tokens = tokenizer.tokenize(bigram_text)
bgs = nltk.bigrams(tokens)
fdist_bgs = nltk.FreqDist(bgs)
# tokenize the trigrams
tokens = tokenizer.tokenize(trigram_text)
tgs = nltk.trigrams(tokens)
fdist_tgs = nltk.FreqDist(tgs)
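As a quick sanity check (illustrative only, not part of the original pipeline), the most frequent bigrams and trigrams can be inspected:
# show the five most common bigrams and trigrams of the training text
print(fdist_bgs.most_common(5))
print(fdist_tgs.most_common(5))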
The following code snippet implements the necessary preprocessing for the Kneser-Ney algorithm. More specifically, we construct the bigrams and their frequencies, as well as the trigrams and their frequencies. In addition, some other useful information is extracted and three dataframes are created: one for unigrams, one for bigrams and one for trigrams.
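For reference, the standard interpolated Kneser-Ney estimate for a bigram, which the functions below approximate (with $D$ the discount, $c(\cdot)$ the counts and $N_{1+}$ the number of distinct continuations/contexts), is

$$P_{KN}(w_2 \mid w_1) = \frac{\max(c(w_1 w_2) - D,\, 0)}{c(w_1)} + \lambda(w_1)\,P_{cont}(w_2), \quad \lambda(w_1) = \frac{D\,N_{1+}(w_1\,\bullet)}{c(w_1)}, \quad P_{cont}(w_2) = \frac{N_{1+}(\bullet\,w_2)}{\text{number of bigram types}}$$

The columns Pcont, lambda, NWordDot and NDotWordDot computed below are this implementation's (approximate) versions of these quantities.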
tokenizer = RegexpTokenizer(r'\w+')
# read the file with bigram adjusted text
bigram_file = open("bigram.txt", "r", encoding="utf8")
bigram_text = bigram_file.read()
# tokenize the bigram text and create the bigrams
tokens = tokenizer.tokenize(bigram_text)
bgs = nltk.bigrams(tokens)
# calculate the frequencies of the bigrams
fdist_bgs = nltk.FreqDist(bgs)
fdist = nltk.FreqDist(tokens)
fdist.pop('start1', None)
fdist.pop('end12', None)
# repeat the same process for the trigram adjusted text
trigram_file = open("trigram.txt", "r", encoding="utf8")
trigram_text = trigram_file.read()
tokens = tokenizer.tokenize(trigram_text)
tgs = nltk.trigrams(tokens)
fdist_tgs = nltk.FreqDist(tgs)
# initialize a dataframe with the necessary info for the bigrams
df_bigram = pd.DataFrame(list(fdist_bgs.items()), columns=["bigram", 'count'])
# one column with the first word of the bigram
df_bigram["first_word"] = [x[0] for x in fdist_bgs]
# one column with the second word of the bigram
df_bigram["second_word"] = [x[1] for x in fdist_bgs]
# sort the dataframe on the first word of the bigram
df_bigram = df_bigram.sort_values(by=["first_word"])
df_bigram.reset_index(inplace=True)
# initialize a dataframe with the necessary info for the trigrams
df_trigram = pd.DataFrame(list(fdist_tgs.items()),
columns=["trigram", 'count'])
# one column with the first word of the trigram
df_trigram["first_word"] = [x[0] for x in fdist_tgs]
# one column with the second word of the trigram
df_trigram["second_word"] = [x[1] for x in fdist_tgs]
# one column with the third word of the trigram
df_trigram["third_word"] = [x[2] for x in fdist_tgs]
# column with a tuple containing the first and the second word of the trigram
df_trigram["pre"] = [x[0:2] for x in fdist_tgs]
# column with a tuple containing the second and the third word of the trigram
df_trigram["post"] = [x[1:3] for x in fdist_tgs]
# sort the dataframe on the first word of the trigram
df_trigram = df_trigram.sort_values(by=["first_word"])
# initialize a dataframe with the necessary info for the unigrams
df_unigram = pd.DataFrame(list(fdist.items()), columns=["unigram", 'count'])
# sort the dataframe on the unigram
df_unigram = df_unigram.sort_values(by=['unigram'])
df_unigram.reset_index(inplace=True)
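A quick look at the resulting dataframes (illustrative only):
# preview the three dataframes built above
print(df_unigram.head())
print(df_bigram[["bigram", "count", "first_word", "second_word"]].head())
print(df_trigram[["trigram", "count", "pre", "post"]].head())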
def addKNUnigram(df_unigram, df_bigram, df_trigram, test, D=0.75):
# tokenize the test, i.e. the sentence for which the smoothed probability will be calculated
unigrams_test = tokenizer.tokenize(test)
# if a token is not found in the unigrams of the training set replace it with "UNK"
unigrams_test = [t if t in df_unigram.unigram.values else "UNK" for t in unigrams_test]
# calculate the frequencies of the tokens
fdist = nltk.FreqDist(unigrams_test)
# add the token start1 at the beginning of the test sentence and the token end12 at the end
test_bgs = 'start1 ' + test.strip() + ' end12'
# tokenize the padded test sentence
line_tokens_bgs = tokenizer.tokenize(test_bgs)
line_tokens_bgs = [t if (t in df_unigram.unigram.values or t in ["start1", "end12"]) else "UNK" for t in line_tokens_bgs]
# create the bigrams
bigrams_test = nltk.bigrams(line_tokens_bgs)
fdist_bgs = nltk.FreqDist(bigrams_test)
# add the tokens start1 start2 at the beginning of the test sentence and the token end12 at the end
test_tgs = 'start1 start2 ' + test.strip() + ' end12'
line_tokens_tgs = tokenizer.tokenize(test_tgs)
line_tokens_tgs = [t if (t in df_unigram.unigram.values or t in ["start1", "end12", "start2"]) else "UNK" for t in line_tokens_tgs]
# create the trigrams
trigrams_test = nltk.trigrams(line_tokens_tgs)
fdist_tgs = nltk.FreqDist(trigrams_test)
unigrams_test_df = pd.DataFrame(unigrams_test)
bigrams_test_df = pd.DataFrame(bigrams_test)
trigrams_test_df = pd.DataFrame(trigrams_test)
# create subsets of the training dataframes containing information
# only for the tokens which exist in the test sentence
sub_unigram = df_unigram[df_unigram["unigram"].isin(unigrams_test)].copy()
sub_bigram = df_bigram[df_bigram["bigram"].isin(fdist_bgs.keys())].copy()
sub_trigram = pd.DataFrame(columns=["trigram", 'count', 'first_word', 'second_word', 'third_word', 'pre', 'post'])
for i in fdist_tgs.keys():
for j in df_trigram["trigram"]:
if i == j:
sub_trigram = sub_trigram.append(df_trigram[df_trigram["trigram"] == j])
#sub_trigram = df_trigram[df_trigram["trigram"].isin(list(fdist_tgs.keys()))].copy()
df_uni_final = pd.DataFrame(columns=["unigram", 'count'])
df_bgs_final = pd.DataFrame(columns=["bigram", 'count', 'first_word', 'second_word'])
df_tgs_final = pd.DataFrame(columns=["trigram", 'count', 'first_word', 'second_word', 'third_word', 'pre', 'post'])
for i in unigrams_test:
df_uni_final = df_uni_final.append(sub_unigram[sub_unigram.unigram == i])
for i in list(nltk.bigrams(line_tokens_bgs)):
if i in list(sub_bigram.bigram):
df_bgs_final = df_bgs_final.append(sub_bigram[sub_bigram.bigram == i])
else:
df_bgs_final = df_bgs_final.append({"bigram":i, "count":0, "first_word":i[0], "second_word":i[1]}, ignore_index=True)
for i in list(nltk.trigrams(line_tokens_tgs)):
if i in list(sub_trigram.trigram):
df_tgs_final = df_tgs_final.append(sub_trigram[sub_trigram.trigram == i])
else:
df_tgs_final = df_tgs_final.append({"trigram":i, "count":0, "first_word":i[0], "second_word":i[1],
"third_word":i[2], "pre":(i[0], i[1]),
"post":(i[1], i[2])}, ignore_index=True)
sub_unigram = df_uni_final.copy()
sub_bigram = df_bgs_final.copy()
sub_trigram = df_tgs_final.copy()
bigrams = len(df_bigram)
temp_df2 = df_bigram.copy()
# NWordDot, how many bigrams begin with a specific word
NWordDot = pd.DataFrame(columns=["WordDot", 'count'])
# NDotWord, how many bigrams end with the specific word
NDotWord = pd.DataFrame(columns=["DotWord", 'count'])
for b in list(sub_bigram.bigram):
# how many bigrams begin with that word
NWordDot = NWordDot.append({"WordDot":b[0], "count": temp_df2[temp_df2.first_word==b[0]].groupby(['first_word']).size().values}, ignore_index=True)
# how many bigrams end with that word
NDotWord = NDotWord.append({"DotWord":b[1], "count": temp_df2[temp_df2.second_word==b[1]].groupby(['second_word']).size().values}, ignore_index=True)
# NDotWordDot, how many trigrams have that word as second word
NDotWordDot = pd.DataFrame(columns=["DotWordDot", 'count'])
temp_df3 = df_trigram.copy()
for t in list(sub_trigram.trigram):
# NDotWordDot, in how many trigrams the word is in the middle
NDotWordDot = NDotWordDot.append({"DotWordDot":t[1], "count": temp_df3[temp_df3.second_word==t[1]].groupby(['second_word']).size().values}, ignore_index=True)
# remove from the dataframes the entries containing the tokens
# start1, start2, end12
NDotWordDot = NDotWordDot[NDotWordDot.DotWordDot!="start2"]
NDotWordDot = NDotWordDot[NDotWordDot.DotWordDot!="start1"]
NDotWordDot = NDotWordDot[NDotWordDot.DotWordDot!="end12"]
NWordDot = NWordDot[NWordDot.WordDot!="end12"]
NWordDot = NWordDot[NWordDot.WordDot!="start1"]
NWordDot = NWordDot[NWordDot.WordDot!="start2"]
NDotWord = NDotWord[NDotWord.DotWord!="end12"]
NDotWord = NDotWord[NDotWord.DotWord!="start1"]
NDotWord = NDotWord[NDotWord.DotWord!="start2"]
# calculate parts of the Kneser-Ney algorithm and
# save those calculated fields in the sub_unigram dataframe
sub_unigram["Pcont"] = (NDotWord["count"]/float(bigrams)).values
sub_unigram.loc[:, 'Pcont'] = sub_unigram.Pcont.map(lambda x: x[0])
sub_unigram["lambda"] = ((float(D)*NWordDot["count"]) / NDotWordDot["count"]).values
sub_unigram.loc[:, 'lambda'] = sub_unigram["lambda"].map(lambda x: x[0])
sub_unigram["NDotWordDot"] = NDotWordDot["count"].values
sub_unigram.loc[:, 'NDotWordDot'] = sub_unigram.NDotWordDot.map(lambda x: x[0])
sub_unigram["NWordDot"] = NWordDot["count"].values
sub_unigram.loc[:, 'NWordDot'] = sub_unigram.NWordDot.map(lambda x: x[0])
sub_unigram.fillna(0, inplace=True)
#sub_unigram = sub_unigram.sort_values(by=['unigram'])
sub_unigram.reset_index(inplace=True)
del sub_unigram["index"]
del sub_unigram["level_0"]
# return the subsets of the training data
return(sub_unigram, sub_bigram, sub_trigram)
def addKNBigram(df_unigram, df_bigram, df_trigram, sub_unigram, sub_bigram, sub_trigram, D=0.75):
temp_df3 = df_trigram.copy()
# NDotW1W2, how many trigrams end with those two specific words
NDotW1W2 = pd.DataFrame(columns=["DotW1W2", 'count'])
# NW1W2Dot, how many trigrams begin with those two specific words
NW1W2Dot = pd.DataFrame(columns=["W1W2Dot", 'count'])
for t in list(sub_trigram.trigram):
NDotW1W2 = NDotW1W2.append({"DotW1W2": (t[1], t[2]), "count": len(temp_df3[temp_df3.post==(t[1], t[2])].groupby(['post']).size().values)}, ignore_index=True)
NW1W2Dot = NW1W2Dot.append({"W1W2Dot": (t[0], t[1]), "count": len(temp_df3[temp_df3.pre==(t[0], t[1])].groupby(['pre']).size().values)}, ignore_index=True)
unigram_temp = sub_unigram.copy()
sub_bigram["mod_count"] = sub_bigram["count"]
stopWords = ["start1", "end12"]
for index, row in sub_bigram.iterrows():
# if a test bigram is not found in the training set, then its count is replaced by the count of its first word
if row["mod_count"] == 0:
if row["first_word"] in stopWords:
sub_bigram.loc[index, "mod_count"] = df_unigram[df_unigram["unigram"] == "UNK"]["count"].values[0]
else:
sub_bigram.loc[index, "mod_count"] = sub_unigram[sub_unigram["unigram"] == row["first_word"]]["count"].values[0]
#sub_bigram.reset_index(inplace=True)
# adjust the dataframes to have the same length, so that
# the calculations are performed correctly
temp_bgs = sub_bigram[:-1]
temp_ndw1w2dot = NW1W2Dot.iloc[1:]
#temp_bgs.reset_index(inplace=True)
# implement some initial calculations for the algorithm
lambda_2 = list((D /temp_bgs["mod_count"]).values * temp_ndw1w2dot["count"].values)
lambda_2.append(None)
sub_bigram["lambda2"] = lambda_2
unigram_temp.set_index('unigram', inplace=True)
trigram_temp = sub_trigram.copy()
for index, row in sub_bigram.iterrows():
w1 = row["first_word"]
w2 = row["second_word"]
if w2 not in stopWords and w1 not in stopWords:
# extract the necessary parts for the calculations
nDotW1W2 = pd.Series(NDotW1W2[NDotW1W2["DotW1W2"] == (w1, w2)]["count"]).values[0]
nDotWordDot = pd.Series(unigram_temp.loc[w2]["NDotWordDot"]).values[0]
lambda_bgs = pd.Series(unigram_temp.loc[w2]["lambda"]).values[0]
pcont_bgs = pd.Series(unigram_temp.loc[w2]["Pcont"]).values[0]
cW1W2 = pd.Series(sub_bigram.loc[index, "mod_count"]).values[0]
cW1 = pd.Series(unigram_temp.loc[w1]["count"]).values[0]
nWordDot = pd.Series(unigram_temp.loc[w1]["NWordDot"]).values[0]
if nDotWordDot == 0:
sub_bigram.loc[index, "Pcont2"] = 0
sub_bigram.loc[index, "KNSmoothing_BGS"] = 0
else:
sub_bigram.loc[index, "Pcont2"] = (max(nDotW1W2 - D, 0.0)/nDotWordDot) + lambda_bgs * pcont_bgs
if (pd.Series(sub_bigram.loc[index, "count"]).values[0] == 0):
# if a bigram is not found in the training set, calculate its probability with a
# process similar to Laplace smoothing: add the number of unigrams to the
# denominator, in order to make the whole probability smaller.
sub_bigram.loc[index, "KNSmoothing_BGS"] = (max(cW1W2 - D, 0.0)/(cW1 + len(df_unigram))) + (D * nWordDot/cW1 * cW1W2/len(df_bigram))
else:
sub_bigram.loc[index, "KNSmoothing_BGS"] = (max(cW1W2 - D, 0.0)/cW1) + (D * nWordDot/cW1 * cW1W2/len(df_bigram))
#del sub_bigram["level_0"]
#del sub_bigram["index"]
# return the updated dataframe
return(sub_bigram)
def addKNTrigram(df_unigram, df_bigram, df_trigram, sub_unigram, sub_bigram, sub_trigram, D=0.75):
bigram_temp = sub_bigram.copy()
stopWords = ["start1", "start2", "end12"]
NlambdaW1W2 = pd.DataFrame(columns=["W1W2", 'lambdaW1W2'])
NprobW2W3 = pd.DataFrame(columns=["W2W3", 'probW2W3'])
for index, row in sub_trigram.iterrows():
pre = row["pre"]
post = row["post"]
w1 = row["first_word"]
w2 = row["second_word"]
w3 = row["third_word"]
cW1W2W3 = pd.Series(row["count"]).values[0]
if w2 not in stopWords and w1 not in stopWords and w3 not in stopWords:
cW1W2 = pd.Series(bigram_temp[bigram_temp.bigram==pre]["mod_count"]).values[0]
cW2 = pd.Series(sub_unigram[sub_unigram.unigram==w2]["count"]).values[0]
if cW1W2 == 0 or cW1W2W3 == 0:
# if a trigram is not found in the training set, the count of the
# middle word is used instead, and the number of bigrams is added
# to the denominator
sub_trigram.loc[index, "MaxLikelTerm"] = max(cW2-D,0)/(len(df_bigram) + cW2)
else:
sub_trigram.loc[index, "MaxLikelTerm"] = max(cW1W2W3-D,0)/cW1W2
temp_lambda2 = bigram_temp[["bigram", "lambda2"]]
data = []
data.insert(0, {'bigram': '(start1, start2)', 'lambda2': None})
temp_lambda2= pd.concat([pd.DataFrame(data), temp_lambda2], ignore_index=True)
temp_lambda2 = temp_lambda2[:-1]
temp_Pcont2 = bigram_temp[["bigram", "Pcont2"]]
# calculate the smoothed probability for the trigram model
sub_trigram["KNSmoothing_TGS"] = sub_trigram["MaxLikelTerm"].values + temp_lambda2["lambda2"].values * temp_Pcont2["Pcont2"].values
return(sub_trigram)
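The three functions above are chained as follows; a minimal usage sketch on a single (hypothetical) test sentence, mirroring the evaluation code further below:
sentence = "the committee adopted the report"  # hypothetical example sentence
uni, bgs, tgs = addKNUnigram(df_unigram, df_bigram, df_trigram, sentence)
bgs = addKNBigram(df_unigram, df_bigram, df_trigram, uni, bgs, tgs)
tgs = addKNTrigram(df_unigram, df_bigram, df_trigram, uni, bgs, tgs)
print(bgs["KNSmoothing_BGS"].head())  # smoothed bigram probabilities
print(tgs["KNSmoothing_TGS"].head())  # smoothed trigram probabilities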
We compare the log-probabilities of correctly structured sentences with those of sentences whose words have been randomly shuffled. In general, the correctly structured sentences should be more probable, and the results show that this is almost always the case for the trigram model.
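In both evaluation functions the log-probability of a sentence is simply the sum of the log-probabilities of its smoothed n-grams, i.e. for the bigram model $\log P(s) \approx \sum_i \log P_{KN}(w_i \mid w_{i-1})$, and analogously with two words of context for the trigram model.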
# Check the log-probabilities that the trained models return when (correct) sentences
# from the test subset are given vs. (incorrect) sentences of the same length (in words)
# produced by randomly shuffling the words of each sentence.
def eval_bigram(test):
detokenizer = MosesDetokenizer()
testdf = pd.DataFrame(columns=["correct_sentence","logProb_cs","wrong_sentence","logProb_ws"])
tokenizer = RegexpTokenizer(r'\w+')
for sentence in test:
test_tokenized = tokenizer.tokenize(sentence)
random.shuffle(test_tokenized)
random_test = detokenizer.detokenize(test_tokenized, return_str=True)
correct_uni = (addKNUnigram (df_unigram, df_bigram, df_trigram, sentence))
correct_bgs = (addKNBigram (df_unigram, df_bigram, df_trigram, correct_uni[0], correct_uni[1], correct_uni[2]))
wrong_uni = (addKNUnigram (df_unigram, df_bigram, df_trigram, random_test))
wrong_bgs = (addKNBigram (df_unigram, df_bigram, df_trigram, wrong_uni[0], wrong_uni[1], wrong_uni[2]))
prob_ws = sum(np.log(wrong_bgs["KNSmoothing_BGS"].dropna(axis=0, how='all')))
prob_cs = sum(np.log(correct_bgs["KNSmoothing_BGS"].dropna(axis=0, how='all')))
testdf = testdf.append({"correct_sentence": sentence, "logProb_cs": prob_cs,
"wrong_sentence": random_test, "logProb_ws": prob_ws},
ignore_index=True)
display(testdf)
def eval_trigram(test):
detokenizer = MosesDetokenizer()
testdf = pd.DataFrame(columns=["correct_sentence","logProb_cs","wrong_sentence","logProb_ws"])
tokenizer = RegexpTokenizer(r'\w+')
for sentence in test:
test_tokenized = tokenizer.tokenize(sentence)
random.shuffle(test_tokenized)
random_test = detokenizer.detokenize(test_tokenized, return_str=True)
correct_uni = (addKNUnigram (df_unigram, df_bigram, df_trigram, sentence))
correct_bgs = (addKNBigram (df_unigram, df_bigram, df_trigram, correct_uni[0], correct_uni[1], correct_uni[2]))
correct_tgs = (addKNTrigram (df_unigram, df_bigram, df_trigram, correct_uni[0], correct_bgs, correct_uni[2]))
wrong_uni = (addKNUnigram (df_unigram, df_bigram, df_trigram, random_test))
wrong_bgs = (addKNBigram (df_unigram, df_bigram, df_trigram, wrong_uni[0], wrong_uni[1], wrong_uni[2]))
wrong_tgs = (addKNTrigram (df_unigram, df_bigram, df_trigram, wrong_uni[0], wrong_bgs, wrong_uni[2]))
prob_ws = sum(np.log(wrong_tgs["KNSmoothing_TGS"].dropna(axis=0, how='all')))
prob_cs = sum(np.log(correct_tgs["KNSmoothing_TGS"].dropna(axis=0, how='all')))
testdf = testdf.append({"correct_sentence": sentence, "logProb_cs": prob_cs,
"wrong_sentence": random_test, "logProb_ws": prob_ws},
ignore_index=True)
display(testdf)
eval_bigram(test[:25])
eval_trigram(test[:25])
The aim of the following code snippet is to predict the next word, as in a predictive keyboard. Given a sentence, we focus mainly on its last part (mostly the last few words) and predict the next word. If the last token does not exist in the vocabulary of the trained model, we use the edit distance and, among the closest words, choose the most probable bigrams or trigrams. Although we implemented the edit distance ourselves, for efficiency reasons we use the NLTK implementation, simply setting the substitution cost to two. This approach better simulates real-world scenarios.
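For reference, with substitution_cost=2 a substitution costs as much as a deletion plus an insertion; a quick illustration of the call used below (the word pair is made up):
# the distance is 3 with the default substitution cost of 1, and 5 with substitution_cost=2
print(nltk.edit_distance("kitten", "sitting", substitution_cost=2))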
# The above models could be used to predict the next (vocabulary) word, as in a predictive keyboard
# this method returns the ten most probable bigrams beginning with the given word
def build_bigrams(next_word, df_bigram):
tokenizer = RegexpTokenizer(r'\w+')
tokenized = tokenizer.tokenize(next_word)
tokenized = tokenized[-1]
sub_bigram = df_bigram[(df_bigram.first_word == tokenized) & (df_bigram.second_word != "UNK") & (df_bigram.second_word != "end12")][["bigram", "count"]]
sub_bigram.sort_values('count', ascending=False, inplace=True)
sub_bigram = sub_bigram.head(10)
sub_bigram = sub_bigram["bigram"]
sub_bigram = sub_bigram.apply(lambda x: (' '.join(x)))
return sub_bigram
# this method returns the ten most probable trigrams beginning with the given words
def build_trigrams(next_word, df_trigram):
tokenizer = RegexpTokenizer(r'\w+')
tokenized = tokenizer.tokenize(next_word)
tokenized = tokenized[-2:]
sub_trigram = df_trigram[(df_trigram.first_word == tokenized[0]) & (df_trigram.second_word == tokenized[1]) & (df_trigram.third_word != "UNK") & (df_trigram.third_word != "end12")][["trigram", "count"]]
sub_trigram.sort_values('count', ascending=False, inplace=True)
sub_trigram = sub_trigram.head(10)
sub_trigram = sub_trigram["trigram"]
sub_trigram = sub_trigram.apply(lambda x: (' '.join(x)))
return sub_trigram
# the method calculates the top three most probable words in the given context.
# To achieve this, the models built above are used: the smoothed probabilities
# are calculated for different n-grams, and the words resulting in the highest
# probability are chosen.
def pred_next_word(next_word, df_bigram, df_trigram, df_unigram):
tokenizer = RegexpTokenizer(r'\w+')
tokenized = tokenizer.tokenize(next_word)
pred_words = pd.DataFrame(columns=["n-gram","logProb"])
if (len(tokenized) > 0):
next_bigram = build_bigrams(next_word, df_bigram)
for index, row in next_bigram.iteritems():
correct_uni = (addKNUnigram (df_unigram, df_bigram, df_trigram, row))
correct_bgs = (addKNBigram (df_unigram, df_bigram, df_trigram, correct_uni[0], correct_uni[1], correct_uni[2]))
prob_cs = sum(np.log(correct_bgs["KNSmoothing_BGS"].dropna(axis=0, how='all')))
pred_words = pred_words.append({"n-gram": row, "logProb": prob_cs},
ignore_index=True)
if len(tokenized) > 1:
next_trigram = build_trigrams(next_word, df_trigram)
for index, row in next_trigram.iteritems():
correct_uni = (addKNUnigram (df_unigram, df_bigram, df_trigram, row))
correct_bgs = (addKNBigram (df_unigram, df_bigram, df_trigram, correct_uni[0], correct_uni[1], correct_uni[2]))
correct_tgs = (addKNTrigram (df_unigram, df_bigram, df_trigram, correct_uni[0], correct_bgs, correct_uni[2]))
prob_cs = sum(np.log(correct_tgs["KNSmoothing_TGS"].dropna(axis=0, how='all')))
pred_words = pred_words.append({"n-gram": row, "logProb": prob_cs},
ignore_index=True)
tokenized = tokenized[-1]
pred_words.sort_values('logProb', ascending=False, inplace=True)
df_unigram.sort_values('count', ascending=False, inplace=True)
pred_words = pred_words.head(5)
tokenizer = RegexpTokenizer(r'\w+')
top_words = set()
for index, row in pred_words.iterrows():
tokenized = tokenizer.tokenize(row["n-gram"])
top_words.add(tokenized[-1])
dist_words = pd.DataFrame(columns=["word","dist"])
for index, row in df_unigram.iterrows():
dist_words = dist_words.append({"word":row["unigram"], "dist": nltk.edit_distance(row["unigram"], tokenized)},
ignore_index=True)
dist_words.sort_values('dist', ascending=True, inplace=True)
dist_words = pd.DataFrame(dist_words.head(3 - len(top_words)))
for index, row in dist_words.iterrows():
if len(top_words) < 3:
top_words.add(row["word"])
print("Predictions: ", top_words)
# The above models could be used to predict the next (vocabulary) word, as in a predictive keyboard.
# However, the above approach works only if the last word exists in the vocabulary. If the word does not exist,
# the following approach is proposed, where the Levenshtein distance (edit distance) is calculated. Then, among
# the closest words, the most probable combinations/n-grams are chosen. This case is more generic and more realistic.
def build_bigrams_edit(next_word, df_bigram):
tokenizer = RegexpTokenizer(r'\w+')
tokenized = tokenizer.tokenize(next_word)
sub_bigram = df_bigram[(df_bigram.first_word == tokenized[-2]) & (df_bigram.second_word != "UNK") & (df_bigram.second_word != "end12")][["bigram","count", "second_word"]]
sub_bigram.sort_values('count', ascending=False, inplace=True)
sub_bigram = sub_bigram.head(30)
for index, row in sub_bigram.iterrows():
sub_bigram.loc[index, "dist"] = nltk.edit_distance(row["second_word"], tokenized[-1], substitution_cost=2)
sub_bigram.sort_values('dist', ascending=True, inplace=True)
sub_bigram = sub_bigram.head(10)
sub_bigram = sub_bigram["bigram"]
sub_bigram = sub_bigram.apply(lambda x: (' '.join(x)))
return sub_bigram
def build_trigrams_edit(next_word, df_trigram):
tokenizer = RegexpTokenizer(r'\w+')
tokenized = tokenizer.tokenize(next_word)
sub_trigram = df_trigram[(df_trigram.first_word == tokenized[-3]) & (df_trigram.second_word == tokenized[-2]) & (df_trigram.third_word != "UNK") & (df_trigram.third_word != "end12")][["trigram", "count", "third_word"]]
sub_trigram.sort_values('count', ascending=False, inplace=True)
sub_trigram = sub_trigram.head(30)
for index, row in sub_trigram.iterrows():
sub_trigram.loc[index, "dist"] = nltk.edit_distance(row["third_word"], tokenized[-1], substitution_cost=2)
sub_trigram.sort_values('dist', ascending=True, inplace=True)
sub_trigram = sub_trigram.head(10)
sub_trigram = sub_trigram["trigram"]
sub_trigram = sub_trigram.apply(lambda x: (' '.join(x)))
return sub_trigram
def pred_next_word_edit(next_word, df_bigram, df_trigram, df_unigram):
tokenizer = RegexpTokenizer(r'\w+')
tokenized = tokenizer.tokenize(next_word)
pred_words = pd.DataFrame(columns=["n-gram","logProb"])
if (len(tokenized) > 1):
next_bigram = build_bigrams_edit(next_word, df_bigram)
for index, row in next_bigram.iteritems():
correct_uni = (addKNUnigram (df_unigram, df_bigram, df_trigram, row))
correct_bgs = (addKNBigram (df_unigram, df_bigram, df_trigram, correct_uni[0], correct_uni[1], correct_uni[2]))
prob_cs = sum(np.log(correct_bgs["KNSmoothing_BGS"].dropna(axis=0, how='all')))
pred_words = pred_words.append({"n-gram": row, "logProb": prob_cs},
ignore_index=True)
if len(tokenized) > 2:
next_trigram = build_trigrams_edit(next_word, df_trigram)
for index, row in next_trigram.iteritems():
correct_uni = (addKNUnigram (df_unigram, df_bigram, df_trigram, row))
correct_bgs = (addKNBigram (df_unigram, df_bigram, df_trigram, correct_uni[0], correct_uni[1], correct_uni[2]))
correct_tgs = (addKNTrigram (df_unigram, df_bigram, df_trigram, correct_uni[0], correct_bgs, correct_uni[2]))
prob_cs = sum(np.log(correct_tgs["KNSmoothing_TGS"].dropna(axis=0, how='all')))
pred_words = pred_words.append({"n-gram": row, "logProb": prob_cs},
ignore_index=True)
tokenized = tokenized[-1]
pred_words.sort_values('logProb', ascending=False, inplace=True)
df_unigram.sort_values('count', ascending=False, inplace=True)
pred_words = pred_words.head(5)
tokenizer = RegexpTokenizer(r'\w+')
top_words = set()
for index, row in pred_words.iterrows():
tokenized = tokenizer.tokenize(row["n-gram"])
top_words.add(tokenized[-1])
dist_words = pd.DataFrame(columns=["word","dist"])
for index, row in df_unigram.iterrows():
dist_words = dist_words.append({"word":row["unigram"], "dist": nltk.edit_distance(row["unigram"], tokenized)},
ignore_index=True)
dist_words.sort_values('dist', ascending=True, inplace=True)
dist_words = pd.DataFrame(dist_words.head(3 - len(top_words)))
for index, row in dist_words.iterrows():
if len(top_words) < 3:
top_words.add(row["word"])
else:
break
print("Predictions: ", top_words)
# method to check whether the last token exists in the vocabulary or not:
# if it does not, call the pred_next_word_edit() method; otherwise call
# pred_next_word() with the required arguments
def pred_next(next_word):
tokenizer = RegexpTokenizer(r'\w+')
tokenized = tokenizer.tokenize(next_word)
if (tokenized[-1]) not in df_unigram.unigram.values:
pred_next_word_edit(next_word, df_bigram, df_trigram, df_unigram)
else:
pred_next_word(next_word, df_bigram, df_trigram, df_unigram)
next_word = "the European uni"
pred_next(next_word)
next_word = "the European"
pred_next(next_word)
Calculate metrics for the two different models/approaches. With respect to both perplexity and cross-entropy, the trigram model seems to outperform the bigram model.
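The quantities printed below follow the standard definitions, computed with natural logarithms to match np.log in the code: the cross-entropy is $H = -\frac{1}{N}\sum_{i=1}^{N} \ln P_{KN}(w_i \mid \text{context}_i)$ and the perplexity is $e^{H}$.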
test_string_bgs = " ".join(test[:10])
correct_uni = (addKNUnigram (df_unigram, df_bigram, df_trigram, test_string_bgs))
correct_bgs = (addKNBigram (df_unigram, df_bigram, df_trigram, correct_uni[0], correct_uni[1], correct_uni[2]))
print ('Perplexity Bigram Model: ',
math.exp(sum(np.log(1/correct_bgs["KNSmoothing_BGS"].dropna(axis=0, how='all')))/len(correct_bgs["KNSmoothing_BGS"].dropna(axis=0, how='all'))))
print ('Cross-Entropy Bigram Model: ',
sum(-1*np.log(correct_bgs["KNSmoothing_BGS"].dropna(axis=0, how='all')))/len(correct_bgs["KNSmoothing_BGS"].dropna(axis=0, how='all')))
test_string_tgs = " ".join(test[:10])
correct_uni = (addKNUnigram (df_unigram, df_bigram, df_trigram, test_string_tgs))
correct_bgs = (addKNBigram (df_unigram, df_bigram, df_trigram, correct_uni[0], correct_uni[1], correct_uni[2]))
correct_tgs = (addKNTrigram (df_unigram, df_bigram, df_trigram, correct_uni[0], correct_bgs, correct_uni[2]))
print ('Perplexity Trigram Model: ',
math.exp(sum(np.log(1/correct_tgs["KNSmoothing_TGS"].dropna(axis=0, how='all')))/len(correct_tgs["KNSmoothing_TGS"].dropna(axis=0, how='all'))))
print ('Cross-Entropy Trigram Model: ',
sum(-1*np.log(correct_tgs["KNSmoothing_TGS"].dropna(axis=0, how='all')))/len(correct_tgs["KNSmoothing_TGS"].dropna(axis=0, how='all')))