from IPython.display import display, HTML
display(HTML("<style>.container { width:60% !important; }</style>"))
# For collapsible headings and table of contents
# !pip install jupyter_contrib_nbextensions
# !jupyter contrib nbextension install --user
# !pip install jupyter_nbextensions_configurator
# !jupyter nbextensions_configurator enable --user
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from PIL import Image
from sklearn import preprocessing
!ls
def print_img(img_name, img_size=[18,10]):
    # Load an image from the data folder and render it inline
    image_path = os.path.join(root_data_folder, img_name)
    img = Image.open(image_path)
    plt.figure(figsize=img_size)
    plt.imshow(img)
    plt.show()
root_data_folder = "data_7000-20191105T200553Z-001/data_7000"
data = pd.read_csv('data_7000_new.csv', header=None)
print(data.shape)
data.head().transpose()
# Get NaN counts, then fill: text columns get 'nil', the sentiment column gets 'neutral'
print("NaN counts per column")
for i in range(9):
    t = data.iloc[:,i]
    print("Column", i, ":", t.isna().sum())
    if(i != 8): data.iloc[:,i] = t.fillna('nil')
    else: data.iloc[:,i] = t.fillna('neutral')
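# An equivalent vectorized fill (a sketch; the columns are the integers 0-8
# because the CSV was read with header=None):
# data = data.fillna({**{i: 'nil' for i in range(8)}, 8: 'neutral'})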
# Remove absurdities: two mislabelled sentiment entries
t = data.iloc[:,8]
# A filename got concatenated onto the sentiment value; mark that entry 'none' first
absurd_idx1 = np.where(t == 'positivechandler_Friday-Mood-AF.-meme-Friends-ChandlerBing.jpg')[0][0]
t[absurd_idx1] = 'none'
data.iloc[:,8] = t
# 'none' is not a valid sentiment either; map the first occurrence to 'neutral'
absurd_idx2 = np.where(t == 'none')[0][0]
t[absurd_idx2] = 'neutral'
data.iloc[:,8] = t
print("Absurd data label index 1 :", absurd_idx1)
print("Absurd data label index 2 :", absurd_idx2)
# Looking at the absurd meme
# print_img(data.iloc[absurd_idx1,0])
outlier_indices = []
# Column 5
# Additional label : Hilariousness
# Reduce to : 4x ordinal [not_funny:0, funny:1, very_funny:2, hilarious:3]
cur_index = 4
incorrect_label_threshold_count = 100
# Get Column
t = data.iloc[:,cur_index]
labels, counts = np.unique(t, return_counts=True)
# Get Incorrect labels
incorrect_labels = []
incorrect_count = 0
for i, count in enumerate(counts):
    if(count < incorrect_label_threshold_count):
        incorrect_labels.append(labels[i])
        incorrect_count += count
# Get Outlier indices
cur_outlier_indices = []
for idx in range(len(data)):
    if(t[idx] in incorrect_labels):
        cur_outlier_indices.append(idx)
outlier_indices.append(cur_outlier_indices)
print("Incorrect labels : ", incorrect_count)
# Plot Graph
plt.figure(figsize=[25,5])
plt.bar(labels, counts)
plt.yticks(np.arange(0,2401,200))
plt.grid(axis='y')
plt.show()
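# The rare-label scan above is repeated for each annotation column below; a small
# helper could factor it out. This is a sketch with a hypothetical name, assuming
# the column carries the default RangeIndex that data has here.
def get_rare_label_outliers(column, threshold=100):
    labels, counts = np.unique(column, return_counts=True)
    rare = {label for label, count in zip(labels, counts) if count < threshold}
    # Row positions whose value carries a rare (presumably mistyped) label
    return [idx for idx, value in enumerate(column) if value in rare]
# e.g. get_rare_label_outliers(data.iloc[:,4]) should reproduce cur_outlier_indices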
# Column 6
# Additional label : Intention
# Reduce to : 4x categorical [general:0, not_sarcastic:1, twisted_meaning:2, very_twisted:3]
cur_index = 5
incorrect_label_threshold_count = 100
# Get Column
t = data.iloc[:,cur_index]
labels, counts = np.unique(t, return_counts=True)
# Get Incorrect labels
incorrect_labels = []
incorrect_count = 0
for i, count in enumerate(counts):
    if(count < incorrect_label_threshold_count):
        incorrect_labels.append(labels[i])
        incorrect_count += count
# Get Outlier indices
cur_outlier_indices = []
for idx in range(len(data)):
    if(t[idx] in incorrect_labels):
        cur_outlier_indices.append(idx)
outlier_indices.append(cur_outlier_indices)
print("Incorrect labels : ", incorrect_count)
# Plot Graph
plt.figure(figsize=[25,5])
plt.bar(labels, counts)
plt.yticks(np.arange(0,3401,200))
plt.grid(axis='y')
plt.show()
# Column 7
# Output label : Offensiveness
# Reduce to : 4x ordinal [not_offensive:0, slight:1, very_offensive:2, hateful_offensive:3]
cur_index = 6
incorrect_label_threshold_count = 100
# Get Column
t = data.iloc[:,cur_index]
labels, counts = np.unique(t, return_counts=True)
# Get Incorrect labels
incorrect_labels = []
incorrect_count = 0
for i, count in enumerate(counts):
    if(count < incorrect_label_threshold_count):
        incorrect_labels.append(labels[i])
        incorrect_count += count
# Get Outlier indices
cur_outlier_indices = []
for idx in range(len(data)):
    if(t[idx] in incorrect_labels):
        cur_outlier_indices.append(idx)
outlier_indices.append(cur_outlier_indices)
print("Incorrect labels : ", incorrect_count)
# Plot Graph
plt.figure(figsize=[25,5])
plt.bar(labels, counts)
plt.yticks(np.arange(0,2601,200))
plt.grid(axis='y')
plt.show()
# Column 8
# Output label : isMotivational
# Reduce to : Boolean [isMotivational]
cur_index = 7
incorrect_label_threshold_count = 100
# Get Column
t = data.iloc[:,cur_index]
labels, counts = np.unique(t, return_counts=True)
# Get Incorrect labels
incorrect_labels = []
incorrect_count = 0
for i, count in enumerate(counts):
    if(count < incorrect_label_threshold_count):
        incorrect_labels.append(labels[i])
        incorrect_count += count
# Get Outlier indices
cur_outlier_indices = []
for idx in range(len(data)):
    if(t[idx] in incorrect_labels):
        cur_outlier_indices.append(idx)
outlier_indices.append(cur_outlier_indices)
print("Incorrect labels : ", incorrect_count)
# Plot Graph
plt.figure(figsize=[25,5])
plt.bar(labels, counts)
plt.yticks(np.arange(0,4401,200))
plt.grid(axis='y')
plt.show()
# Column 9
# Output label : Sentiment
# Reduce to : 5x ordinal [very_negative:0, negative:1, neutral:2, positive:3, very_positive:4]
cur_index = 8
# Get Column
t = data.iloc[:,cur_index]
labels, counts = np.unique(t, return_counts=True)
# Plot Graph
plt.figure(figsize=[25,5])
plt.bar(labels, counts)
plt.yticks(np.arange(0,3001,200))
plt.grid(axis='y')
plt.show()
absurd_indices, absurd_counts = np.unique(outlier_indices[0] + outlier_indices[1] + outlier_indices[2] + outlier_indices[3], return_counts=True)
print("Counts of absurdities per incorrect data indice :")
print(absurd_counts)
print("\nAbsurd indices :")
print(absurd_indices)
# for absurd_idx in absurd_indices:
# print_img(data.iloc[absurd_idx,0])
absurd_indices = [117, 174, 351, 434, 578, 587, 601, 667, 684, 772, 1045, 1167, 1296, 1304, 1355, 1386, 1490, 1493, 1651, 1675, 1888, 1892, 1982, 1986, 1992, 2025, 2044, 2058, 2328, 2576, 2874, 2923, 2959, 3042, 3113, 3342, 3495, 3536, 3563, 3598, 3630, 3670, 3674, 3687, 3698, 3735, 3774, 3780, 3940, 4091, 4143, 4174, 4312, 4376, 4544, 4594, 4689, 4943, 4947, 5060, 5111, 5144, 5239, 5251, 5262, 5382, 5438, 5465, 5503, 5700, 5817, 5834, 5952, 5959, 5994, 6025, 6105, 6250, 6360, 6512]
mask = np.ones(data.shape[0])
mask[absurd_indices] = 0
mask = np.asarray(mask, dtype=bool)
cdata = data[mask]
cdata = pd.DataFrame(cdata, dtype=str)
cdata = cdata.reset_index(drop=True)
cdata.head(5)
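# Equivalent row removal without the boolean mask (a sketch; note it skips the
# dtype=str cast above):
# cdata = data.drop(index=absurd_indices).reset_index(drop=True)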
from matplotlib import gridspec
fig = plt.figure(figsize=[30,7])
gs = gridspec.GridSpec(1, 3, width_ratios=[4, 2, 5])
plt.subplot(gs[0])
plt.title('Offensiveness')
t = cdata.iloc[:,6]
labels, counts = np.unique(t, return_counts=True)
print(counts)
plt.bar(labels, counts)
plt.grid(axis='y')
plt.subplot(gs[1])
plt.title('isMotivational')
t = cdata.iloc[:,7]
labels, counts = np.unique(t, return_counts=True)
plt.bar(labels, counts)
plt.grid(axis='y')
plt.subplot(gs[2])
plt.title('Sentiment')
t = cdata.iloc[:,8]
labels, counts = np.unique(t, return_counts=True)
plt.bar(labels, counts)
plt.grid(axis='y')
plt.show()
print("BaseLines :")
print(" Offensiveness : {:.4f}".format(len( cdata.iloc[:,6][cdata.iloc[:,6] == 'not_offensive']) / len(cdata)))
print(" isMotivational : {:.4f}".format(len( cdata.iloc[:,7][cdata.iloc[:,7] == 'not_motivational']) / len(cdata)))
print(" Sentiment : {:.4f}".format(len( cdata.iloc[:,8][cdata.iloc[:,8] == 'positive']) / len(cdata)))
from matplotlib import gridspec
fig = plt.figure(figsize=[30,7])
gs = gridspec.GridSpec(1, 2, width_ratios=[4, 4])
plt.subplot(gs[0])
plt.title('Hilariousness')
t = cdata.iloc[:,4]
labels, counts = np.unique(t, return_counts=True)
plt.bar(labels, counts)
plt.grid(axis='y')
plt.subplot(gs[1])
plt.title('Intention')
t = cdata.iloc[:,5]
labels, counts = np.unique(t, return_counts=True)
plt.bar(labels, counts)
plt.grid(axis='y')
plt.show()
# # Label encode
# hilariousness = cdata.iloc[:,4]
# le = preprocessing.LabelEncoder()
# transf = le.fit_transform(hilariousness)
# print("Identified :", le.classes_)
# # Break into different categories
# hilariousness = np.zeros([hilariousness.shape[0],4])
# for idx,types in enumerate(transf):
#     hilariousness[idx][types] = 1
# print("Transformed :", le.inverse_transform([0,1,2,3]))
# # Make Dataframe
# cols = ['isFunny', 'isHilarious', 'isNotFunny', 'isVeryFunny']
# hilariousness = pd.DataFrame(hilariousness, columns = cols, dtype=np.int8)
# hilariousness_ordinal = hilariousness
# hilariousness_ordinal
hilariousness = pd.DataFrame(cdata.iloc[:,4])
hilariousness.columns = ['Hilariousness']
hilariousness = hilariousness.replace({'Hilariousness': {
'not_funny':0,
'funny':1,
'very_funny':2,
'hilarious':3
}})
hilariousness
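# Alternative encoding (a sketch): an ordered pandas Categorical keeps the label
# names while exposing the same 0-3 codes (and -1 for unseen labels) via .codes.
hilariousness_order = ['not_funny', 'funny', 'very_funny', 'hilarious']
hilariousness_cat = pd.Categorical(cdata.iloc[:,4], categories=hilariousness_order, ordered=True)
# hilariousness_cat.codes should match hilariousness['Hilariousness'].values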
# # Label encode
# intentions = cdata.iloc[:,5]
# le = preprocessing.LabelEncoder()
# transf = le.fit_transform(intentions)
# print("Identified :", le.classes_)
# # Break into different categories
# intentions = np.zeros([intentions.shape[0],4])
# for idx,types in enumerate(transf):
#     intentions[idx][types] = 1
# print("Transformed :", le.inverse_transform([0,1,2,3]))
# # Make Dataframe
# cols = ['isGeneral', 'isNotSarcastic', 'isTwisted', 'isVeryTwisted']
# intentions = pd.DataFrame(intentions, columns = cols, dtype=np.int8)
# intentions_ordinal = intentions
# intentions_ordinal
intentions = pd.DataFrame(cdata.iloc[:,5])
intentions.columns = ['Intentions']
intentions = intentions.replace({'Intentions': {
'general':0,
'not_sarcastic':1,
'twisted_meaning':2,
'very_twisted':3,
}})
intentions
labels = cdata.iloc[:,6:]
labels.columns = ['Offensiveness', 'isMotivational', 'Sentiment']
labels.head()
labels = labels.replace({'Offensiveness': {
'not_offensive':0,
'slight':1,
'very_offensive':2,
'hateful_offensive':3
}})
labels = labels.replace({'isMotivational': {
'not_motivational':0,
'motivational':1
}})
labels = labels.replace({'Sentiment': {
'very_negative':0,
'negative':1,
'neutral':2,
'positive':3,
'very_positive':4
}})
labels.head()
template_list = [
"10_year",
"avengers",
"baby",
"barney",
"bean",
"best",
"bethe",
"big_bang",
"cat_U",
"chandler",
"chuck",
"country",
"deadpool",
"decaprio",
"distracted_bf",
"dr_evil",
"drunk_baby",
"feminist",
"friends",
"gene",
"gf",
"giorgio",
"godfather",
"got",
"harry",
"harvey",
"hillary",
"hitler",
"jim",
"joker",
"kim",
"liam",
"lor",
"minion",
"misog",
"modi",
"morpheous",
"mrbri",
"nemo",
"obama",
"penguin",
"pepe",
"picard",
"putin",
"racis",
"rahul",
"seal",
"sexist",
"skeptical",
"spector",
"spiderman",
"sports",
"stevejobs",
"success",
"tech",
"third",
"titanic",
"tom",
"trump",
"x_men",
"zeck",
]
file_names = cdata.iloc[:, 0]
file_templates_list = []
for file_name in file_names:
    file_templates = np.zeros([1,len(template_list)])
    for i,template in enumerate(template_list):
        if(file_name.find(template) != -1):
            file_templates[0,i] = 1
            # print(template)
            # print(file_name)
            # print('------------------------------------------------------------------------------------------')
    file_templates_list.append(file_templates)
file_templates_list = np.asarray(file_templates_list).squeeze()
templates_df = pd.DataFrame(file_templates_list, columns=['template_'+template for template in template_list], dtype=np.int8)
templates_df
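# Vectorized alternative (a sketch): str.contains performs the same substring
# test per template; regex=False keeps names like '10_year' literal.
templates_df_alt = pd.DataFrame(
    {'template_' + template: file_names.str.contains(template, regex=False).astype(np.int8)
     for template in template_list})
# templates_df_alt.equals(templates_df) should hold given matching indices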
import nltk
import spacy
import string
import re
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
### DEFAULT word preprocessing functions
# Turn to lowercase
def text_lowercase(text):
    return text.lower()
# Remove numbers
def remove_numbers(text):
    result = re.sub(r'\d+', '', text)
    return result
# Remove punctuation
# TODO : Add space to prevent concat
def remove_punctuation(word_array):
    output_word_array = []
    translator = str.maketrans('', '', string.punctuation)
    for word in word_array:
        clean_word = word.translate(translator)
        if(clean_word != ''): output_word_array.append(clean_word)
    return output_word_array
# Remove stopwords function
def remove_stopwords(text):
    stop_words = set(stopwords.words("english"))
    word_tokens = word_tokenize(text)
    filtered_text = [word for word in word_tokens if word not in stop_words]
    return filtered_text
# Lemmatize string
lemmatizer = WordNetLemmatizer()
def lemmatize_word(word_array):
    lemmatized_word_array = []
    for word in word_array:
        word_tokens = word_tokenize(word)
        lemmas = [lemmatizer.lemmatize(token, pos='v') for token in word_tokens]
        lemmatized_word_array += lemmas
    return lemmatized_word_array
print(text_lowercase("Hey, did you know that the summer break is coming? Amazing right !! It's only 5 more days !!"))
print(remove_numbers("There are 3 balls in this bag, and 12 in the other one.") )
print(remove_punctuation("Hey, did you know that the #summer_break is... coming? Amazing @right !! It's only 5 more days !!".split()))
print(remove_stopwords("This is a sample sentence and we are going to remove the stopwords from this."))
print(lemmatize_word('Data science uses scientific methods algorithms and many types of processes'.split()))
### CUSTOM word preprocessing functions
### Domain specific
# Handle website names.
# Remove only the words containing '.com', '.co', '.net' ?
# Remove all words containing '.' except when the dot is the last character.
# Do this before removing punctuation.
def remove_word_containing_dot(text_arr):
    clean_text_arr = []
    for word in text_arr:
        if(word.find('.') == -1 or word.find('.') == len(word)-1):
            clean_text_arr.append(word)
    return clean_text_arr
# Handle Twitter artefacts.
# Remove common words.
# Remove month names.
def remove_twitter(text_arr):
    banned_words = ["retweets", "likes", "k", "pm", "follow"]
    months = ["jan", "feb", "mar", "apr", "may", "jun", "jul", "aug", "sep", "oct", "nov", "dec"]
    conc_remove_words = banned_words + months
    # print(conc_remove_words)
    clean_text_arr = []
    for word in text_arr:
        if(word not in conc_remove_words):
            clean_text_arr.append(word)
    return clean_text_arr
# Handle Twitter timestamps.
# Remove all words after the first word containing ':'.
# Bad idea; see meme at idx 6.
# SKIPPED for now, kept as a pass-through (a possible sketch follows below)
def remove_all_text_after_colon(text):
    return text
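# For reference, the truncation described above could look like this (a sketch
# on a word array, hypothetical name, left commented out per the note above):
# def remove_all_words_after_colon(text_arr):
#     for i, word in enumerate(text_arr):
#         if ':' in word:
#             return text_arr[:i]
#     return text_arr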
print(remove_word_containing_dot("Me a memee text. SnoopyMems.com. Here more meme text. AnotherWebsite.net".split()))
print(remove_twitter("Kudus to @narendramodi ji 8:05 PM - 16 Jan 2019 from Mumbai, India".split()))
# TODO : Don't remove 2009, 2019 etc. when stripping numbers; such years carry meme context.
def clean_text(input_text):
    # Use 'text' as the variable name so the string module is not shadowed
    text = text_lowercase(input_text)
    text = remove_numbers(text)
    text_arr = text.split()
    text_arr = remove_word_containing_dot(text_arr)
    text_arr = remove_punctuation(text_arr)
    text_arr = remove_twitter(text_arr)
    # text_arr = remove_stopwords(text)
    text_arr = lemmatize_word(text_arr)
    return text_arr
print(clean_text("Hey, did you know that 4 the summer Break is coming? Amazing right !! It's only 5 more days !!"))
print(clean_text("Me a memee text. SnoopyMems.com. Here more meme text. AnotherWebsite.net"))
print(clean_text("Kudus to @narendramodi ji 8:05 PM - 16 Jan 2019 from Mumbai, India"))
print(clean_text("Sam Thorne @Strippin ( Follow Follow Saw everyone posting these 2009 vs 2019 pics so here's mine 6:23 PM - 12 Jan 2019 O 636 Retweets 3 224 LIKES 65 636 3.2K "))
# Examine results
desc1 = data.iloc[:, 2]
desc2 = data.iloc[:, 3]
for i in range(50):
    print("#", i)
    text = desc2[i]
    print(text)
    print(clean_text(text))
    print_img(data.iloc[i,0])
    print("-------------------------------------------------------------------------")