import os
import pickle
import time
from datetime import datetime
import re
import nltk
from nltk.corpus import stopwords
import numpy as np
import pandas as pd
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
nltk.download('stopwords')
The cell below loads the preprocessed and cleaned dataset. You may skip ahead to the "Generating the Co-occurrence Matrix" section.
def load_data():
    with open("corpus.txt", "rb") as fp:
        corpus = pickle.load(fp)
    with open("vocabulary.txt", "rb") as fp:
        vocabulary = pickle.load(fp)
    with open("word2idx.txt", "rb") as fp:
        word2idx = pickle.load(fp)
    with open("idx2word.txt", "rb") as fp:
        idx2word = pickle.load(fp)
    return corpus, vocabulary, word2idx, idx2word
corpus, vocabulary, word2idx, idx2word = load_data()
# NOTE : Skip to "Generating Co-occurrence Matrix" section.
def clean_sentence(sentence):
    '''
    Cleans the sentence: tokenizes it, converts all words to lowercase,
    and removes stopwords, symbols and numeric values.
    '''
    stop_words = set(stopwords.words('english'))
    stop_words.add('br')  # strip leftover HTML <br /> tags
    tokens = re.findall(r"[\w]+", sentence)
    # Lowercase first so capitalized stopwords (e.g. "This") are also filtered out.
    tokens = [token.lower() for token in tokens]
    filtered_tokens = [token for token in tokens if token not in stop_words]
    filtered_tokens = [token.translate(token.maketrans("", "", ".,:;''<>{}()[]_1234567890?")) for token in filtered_tokens]
    return filtered_tokens
sentence = "This is a 'sample' sentence... to _verify_ the 99th <br /><br />function!"
clean_sentence(sentence)
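With the cleaned-up function above, this call should return roughly ['sample', 'sentence', 'verify', 'th', 'function']: the stopwords and 'br' tags are dropped, underscores are stripped from 'verify', and '99th' survives as 'th' once its digits are removed.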
The data is organised into test/train and pos/neg folders. dummy contains a dummy dataset for model inspection. sample[x] contains a subset of the whole dataset with 'x' different files.
### MAKING THE CORPUS ###
folder_list = [
    #os.path.join("data","dummy")
    #os.path.join("data","sample20"),
    #os.path.join("data","sample50"),
    #os.path.join("data","sample250"),
    os.path.join("data","sample1000"),
    #os.path.join("data","test","neg"),
    #os.path.join("data","test","pos"),
    #os.path.join("data","train","neg"),
    #os.path.join("data","train","pos")
]
corpus = []
for folder in folder_list:
    file_list = os.listdir(folder)
    for file in file_list:
        file_path = os.path.join(folder, file)
        # Use a context manager so each file handle is closed after reading.
        with open(file_path, encoding='utf8') as file_content:
            data = file_content.read()
        clean_data = clean_sentence(data)
        corpus.append(clean_data)
### MAKING THE VOCABULARY ###
# A set comprehension avoids quadratic membership tests on a growing list.
vocabulary = sorted({token for sentence in corpus for token in sentence})
vocab_size = len(vocabulary)
word2idx = {w: idx for (idx, w) in enumerate(vocabulary)}
idx2word = {idx: w for (idx, w) in enumerate(vocabulary)}
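A quick optional round-trip check that the two index mappings are mutually consistent:
assert all(idx2word[word2idx[w]] == w for w in vocabulary)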
### MAKING THE CO-OCCURRENCE MATRIX ###
window_size = 10
vocab_size = len(vocabulary)
X = np.zeros((vocab_size, vocab_size))
for token_list in corpus:
    token_count = len(token_list)
    for token_index, token in enumerate(token_list):
        left = max(0, token_index - window_size)
        right = min(token_count, token_index + window_size + 1)
        context_tokens_idx = [idx for idx in range(left, right) if idx != token_index]
        context_tokens = [token_list[idx] for idx in context_tokens_idx]
        for context_token in context_tokens:
            vocab_index_centre = word2idx[token]
            vocab_index_context = word2idx[context_token]
            X[vocab_index_centre][vocab_index_context] += 1
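A quick optional sanity check: the context window is symmetric, so every pair is counted once in each direction and X should equal its transpose.
assert np.allclose(X, X.T)
print("Non-zero entries in X :", np.count_nonzero(X))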
def print_time(time_begin, time_end):
    FMT = '%H:%M:%S'
    td = (datetime.strptime(time_end[11:19], FMT) - datetime.strptime(time_begin[11:19], FMT)).seconds
    hrs = td // 3600
    mins = (td - 3600 * hrs) // 60
    secs = td - 3600 * hrs - 60 * mins
    print("Time taken : {:02d}:{:02d}:{:02d}".format(hrs, mins, secs))
def get_sim_val(a1, a2, b1, b2):
    '''
    Returns a real number that measures how well the relation a1:a2::b1:b2 holds.
    If it holds, U[a1]-U[a2] and U[b1]-U[b2] are nearly parallel, so their
    elementwise ratio is roughly constant and its standard deviation is near zero.
    Lower is better.
    '''
    # Parenthesise both differences; without this, operator precedence applies the division first.
    sim_vec = (U[word2idx[a1]] - U[word2idx[a2]]) / (U[word2idx[b1]] - U[word2idx[b2]])
    sim = np.sqrt(np.mean(abs(sim_vec - sim_vec.mean()) ** 2))
    return sim
def measure_progress():
    '''
    Aggregates analogy scores for a fixed set of word pairs and reports the sum
    as a measure of model performance. Lower is better.
    '''
    v1 = get_sim_val("king", "queen", "man", "woman")
    v2 = get_sim_val("king", "queen", "boy", "girl")
    v3 = get_sim_val("boy", "girl", "man", "woman")
    val = v1 + v2 + v3
    print("Similarity Value : {:.5f}".format(val))
    return val
#measure_progress()
print(len(corpus))
print(vocab_size)
print(vocab_size ** 2)
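Since the co-occurrence matrix is stored densely, its memory footprint grows quadratically with the vocabulary size; X.nbytes reports the exact figure:
print("X memory : {:.1f} MB".format(X.nbytes / 1e6))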
# Dimension of generated word vector
dimension = 15
# Embedding Matrix for Main word
U = (np.random.rand(vocab_size, dimension) - 0.5) / float(dimension + 1)
# Embedding Matrix for Context word
V = (np.random.rand(vocab_size, dimension) - 0.5) / float(dimension + 1)
# Bias for Main Word
bu = (np.random.rand(vocab_size).reshape(-1,1) - 0.5) / float(dimension + 1)
# Bias for Context Word
bv = (np.random.rand(vocab_size).reshape(-1,1) - 0.5) / float(dimension + 1)
iter_list = []
cost_list = []
#def train_glove(vocabulary, training_samples, dimension=5, iterations=10, learning_rate=0.01):
iterations = 50
learning_rate = 0.001
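For reference, the training cell below performs per-entry gradient descent on the standard GloVe weighted least-squares objective, $J = \sum_{i,j:\,X_{ij} \neq 0} f(X_{ij})\,(u_i^\top v_j + b_i + \tilde{b}_j - \log X_{ij})^2$, where $u_i$ and $v_j$ are rows of U and V, $b_i$ and $\tilde{b}_j$ are the main and context biases, and $f$ is the weighting function get_weight defined below.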
### Load a pretrained model ###
with open("params.txt", "rb") as fp:
    params = pickle.load(fp)
with open("monitering_lists.txt", "rb") as fp:
    monitering_lists = pickle.load(fp)
U, V, bu, bv = params
iter_list, cost_list = monitering_lists
iteration = iter_list[-1]
def get_weight(x_ij, x_max=100, alpha=0.75):
    '''GloVe weighting function f(x): caps the influence of very frequent pairs.'''
    if x_ij < x_max:
        return (x_ij / x_max) ** alpha
    return 1
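For instance, with the defaults x_max=100 and alpha=0.75, a pair seen 25 times gets weight (25/100) ** 0.75 ≈ 0.35, and anything seen at least 100 times is capped at 1, so very frequent co-occurrences cannot dominate the loss.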
### Train the model ###
total_iteration = iteration + iterations
print("Beginning Training...")
for _ in range(iterations):
    time_begin = time.asctime()
    print("-------------------------------------------")
    iteration += 1
    cost = 0
    for i in range(vocab_size):
        for j in range(vocab_size):
            x_ij = X[i][j]
            if x_ij != 0:
                w = get_weight(x_ij)
                # Residual of the GloVe objective; note the context bias is bv[j], not bv[i].
                loss_in = (np.dot(U[i], V[j]) + bu[i] + bv[j] - np.log(x_ij)).item()
                loss = w * (loss_in ** 2)
                cw = w * loss_in
                # Keep a copy of U[i] so the V[j] update uses the pre-update value.
                u_i_old = U[i].copy()
                U[i] = U[i] - learning_rate * (cw * V[j])
                V[j] = V[j] - learning_rate * (cw * u_i_old)
                bu[i] = bu[i] - learning_rate * cw
                bv[j] = bv[j] - learning_rate * cw
                cost += loss
    iter_list.append(iteration)
    cost_list.append(cost)
    print("Iteration : {}/{}".format(iteration, total_iteration))
    print("Cost : {:.5f}".format(cost))
    time_end = time.asctime()
    print_time(time_begin, time_end)
    sim = measure_progress()
    print("-------------------------------------------")
print("Done.")
### Import the saved TSNE Vectors ###
with open("tsne_vecs.txt", "rb") as fp:
    tsne_vecs = pickle.load(fp)
### Build TSNE Vectors ###
def build_tsne_vecs(U, V):
    '''
    Returns 2-D TSNE vectors for the averaged embedding matrix (U+V)/2;
    averaging the main and context embeddings is a common GloVe trick.
    '''
    tsne_vecs = TSNE().fit_transform((U + V) / 2)
    print(tsne_vecs.shape)
    return tsne_vecs
#tsne_vecs = build_tsne_vecs(U,V)
### Plot the Cost Graph ###
plt.figure(figsize=[20,10])
plt.plot(iter_list, cost_list, '-', lw=3, c='salmon', label='Training Cost')
plt.title('COST vs ITERATIONS', size=30)
plt.xlabel('Number of Iterations', size=20)
plt.ylabel('Cost', size=20)
plt.grid(True, linestyle='-.')
plt.tick_params(labelcolor='k', labelsize=15, width=3)
plt.legend(fontsize=15)
fig1 = plt.gcf()
plt.show()
fig1.savefig('cost_vs_iterations.png', dpi=50)
### Visualise the Word Vectors ###
x = [point[0] for point in tsne_vecs]
y = [point[1] for point in tsne_vecs]
plt.figure(figsize=(50,50))
# Plot every 5th word to keep the figure legible.
for i in range(0, vocab_size, 5):
    plt.scatter(x[i], y[i])
    plt.annotate(idx2word[i],
                 xy=(x[i], y[i]),
                 xytext=(5, 2),
                 textcoords='offset points',
                 ha='right',
                 va='bottom')
# Grab the figure before plt.show() clears it (same pattern as the cost plot above).
ref = plt.gcf()
plt.show()
ref.savefig('tsne, iter=350, dpi=100, words=allby5.png', dpi=100)
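As a final sanity check on the embeddings themselves, here is a minimal nearest-neighbour sketch (not part of the original notebook) over the averaged matrix (U+V)/2, using cosine similarity; nearest_words is a hypothetical helper name:
def nearest_words(query, k=5):
    # Hypothetical helper: rank all words by cosine similarity to `query`.
    W = (U + V) / 2
    q = W[word2idx[query]]
    sims = (W @ q) / (np.linalg.norm(W, axis=1) * np.linalg.norm(q) + 1e-8)
    best = np.argsort(-sims)[1:k + 1]  # skip the query word itself
    return [idx2word[i] for i in best]

#nearest_words("good")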