Preliminary

In [1]:
from IPython.display import display, HTML
display(HTML("<style>.container { width:60% !important; }</style>"))
In [5]:
# For collapsible headings and table of contents

# !pip install jupyter_contrib_nbextensions
# !jupyter contrib nbextension install --user
# !pip install jupyter_nbextensions_configurator
# !jupyter nbextensions_configurator enable --user
In [6]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from PIL import Image
from sklearn import preprocessing
In [7]:
!ls
backup	       data_7000-20191105T200553Z-001  label.csv  perprocess.ipynb
Clean_1.ipynb  data_7000_new.csv	       Misc	  train.csv
In [8]:
# Display an image from the dataset folder (uses the global root_data_folder)
def print_img(img_name, img_size=[18,10]):
    image_path = os.path.join(root_data_folder, img_name)
    img = Image.open(image_path)
    plt.figure(figsize=img_size)
    plt.imshow(img)
    plt.show()
In [9]:
root_data_folder = "data_7000-20191105T200553Z-001/data_7000"
data = pd.read_csv('data_7000_new.csv', header=None)

print(data.shape)
data.head().transpose()
(6601, 9)
Out[9]:
0 1 2 3 4
0 10_year_2r94rv.jpg 10_year_10-year-challenge_1547788782.jpeg 10_year_10yearchallenge-5c75f8b946e0fb0001edc7... 10_year_10-year-challenge-sweet-dee-edition-40... 10_year_10-year-challenge-with-no-filter-47-hi...
1 https://i.imgflip.com/2r94rv.jpg https://spiderimg.amarujala.com/assets/images/... https://www.lifewire.com/thmb/8wNfd94_meE9X2cp... https://pics.conservativememes.com/10-year-cha... https://pics.me.me/10-year-challenge-with-no-f...
2 LOOK THERE MY FRIEND LIGHTYEAR NOW ALL SOHALIK... The best of #10 YearChallenge! Completed in le... Sam Thorne @Strippin ( Follow Follow Saw every... 10 Year Challenge - Sweet Dee Edition 10 YEAR CHALLENGE WITH NO FILTER 47 Hilarious ...
3 LOOK THERE MY FRIEND LIGHTYEAR NOW ALL SOHALIK... The best of #10 YearChallenge! Completed in le... Sam Thorne @Strippin ( Follow Follow Saw every... 10 Year Challenge - Sweet Dee Edition 10 YEAR CHALLENGE WITH NO FILTER 47 Hilarious ...
4 hilarious not_funny very_funny very_funny hilarious
5 general general not_sarcastic twisted_meaning very_twisted
6 not_offensive not_offensive not_offensive very_offensive very_offensive
7 not_motivational motivational not_motivational motivational not_motivational
8 very_positive very_positive positive positive neutral

Remove NaNs

In [10]:
# Get NaN counts

print("NaN counts per Column")

for i in range(9):
    t = data.iloc[:,i]
    print("Column", i, ":",t[t.isna()].shape)
    
    if(i != 8): data.iloc[:,i] = t.fillna('nil')
    else: data.iloc[:,i] = t.fillna('neutral')
NaN counts per Column
Column 0 : (0,)
Column 1 : (0,)
Column 2 : (153,)
Column 3 : (9,)
Column 4 : (0,)
Column 5 : (0,)
Column 6 : (0,)
Column 7 : (7,)
Column 8 : (80,)
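The loop above also works as a single call: fillna accepts a per-column dict of fill values. A minimal sketch (not run here), assuming the integer column labels produced by header=None; it skips the per-column count printing:

# Sketch : one-shot per-column fill, keyed by integer column labels
fill_values = {i: 'nil' for i in range(8)}
fill_values[8] = 'neutral'
data = data.fillna(fill_values)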
In [11]:
# Fix absurd labels in the Sentiment column (column index 8)

t = data.iloc[:,8]

# A filename leaked into the label column; replace it with 'none' first
absurd_idx1 = np.where(t == 'positivechandler_Friday-Mood-AF.-meme-Friends-ChandlerBing.jpg')[0][0]
t.iloc[absurd_idx1] = 'none'

# 'none' is not a valid sentiment either; map it to 'neutral'
absurd_idx2 = np.where(t == 'none')[0][0]
t.iloc[absurd_idx2] = 'neutral'

data.iloc[:,8] = t

print("Absurd data label index 1 :",absurd_idx1)
print("Absurd data label index 2 :",absurd_idx2)
Absurd data label index 1 : 670
Absurd data label index 2 : 670
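Since both steps land on the same row, the fix can also be expressed as a single replace call; a minimal sketch (not run here):

# Sketch : map both absurd values straight to 'neutral' in one pass
data.iloc[:,8] = data.iloc[:,8].replace({
    'positivechandler_Friday-Mood-AF.-meme-Friends-ChandlerBing.jpg': 'neutral',
    'none': 'neutral'})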
In [12]:
# Looking at the absurd meme
# print_img(data.iloc[absurd_idx1,0])

Cleaning the Labels

In [13]:
outlier_indices = []
In [14]:
# Column 5 (index 4)
# Additional label : Hilariousness
# Reduce to : 4x ordinal [not_funny:0, funny:1, very_funny:2, hilarious:3]

cur_index = 4
incorrect_label_threshold_count = 100

# Get Column
t = data.iloc[:,cur_index]
labels, counts = np.unique(t, return_counts=True)

# Get Incorrect labels
incorrect_labels = []
incorrect_count = 0
for i, count in enumerate(counts):
    if(count < incorrect_label_threshold_count):
        incorrect_labels.append(labels[i])
        incorrect_count += count

# Get Outlier indices
cur_outlier_indices = []
for idx in range(len(data)):
    if(t[idx] in incorrect_labels):
        cur_outlier_indices.append(idx)
        
outlier_indices.append(cur_outlier_indices)
print("Incorrect labels : ", incorrect_count)


# Plot Graph
plt.figure(figsize=[25,5])
plt.bar(labels, counts)
plt.yticks(np.arange(0,2401,200))
plt.grid(axis='y')
plt.show()
Incorrect labels :  80
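The next three cells repeat this exact pattern for the remaining label columns, so a small helper would remove the duplication; a minimal sketch of the same logic:

def find_rare_label_rows(frame, col_index, threshold=100):
    # Indices of rows whose label in this column occurs fewer than `threshold` times
    col = frame.iloc[:, col_index]
    counts = col.value_counts()
    rare_labels = counts[counts < threshold].index
    return np.where(col.isin(rare_labels))[0].tolist()

# e.g. find_rare_label_rows(data, 4) should reproduce outlier_indices[0]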
In [15]:
# Column 6 (index 5)
# Additional label : Intention
# Reduce to : 4x categorical [general, not_sarcastic, twisted_meaning, very_twisted]

cur_index = 5
incorrect_label_threshold_count = 100

# Get Column
t = data.iloc[:,cur_index]
labels, counts = np.unique(t, return_counts=True)

# Get Incorrect labels
incorrect_labels = []
incorrect_count = 0
for i, count in enumerate(counts):
    if(count < incorrect_label_threshold_count):
        incorrect_labels.append(labels[i])
        incorrect_count += count

# Get Outlier indices
cur_outlier_indices = []
for idx in range(len(data)):
    if(t[idx] in incorrect_labels):
        cur_outlier_indices.append(idx)
        
outlier_indices.append(cur_outlier_indices)
print("Incorrect labels : ", incorrect_count)

# Plot Graph
plt.figure(figsize=[25,5])
plt.bar(labels, counts)
plt.yticks(np.arange(0,3401,200))
plt.grid(axis='y')
plt.show()
Incorrect labels :  80
In [16]:
# Column 7 (index 6)
# Output label : Offensiveness
# Reduce to : 4x ordinal [not_offensive:0, slight:1, very_offensive:2, hateful_offensive:3]

cur_index = 6
incorrect_label_threshold_count = 100

# Get Column
t = data.iloc[:,cur_index]
labels, counts = np.unique(t, return_counts=True)

# Get Incorrect labels
incorrect_labels = []
incorrect_count = 0
for i, count in enumerate(counts):
    if(count < incorrect_label_threshold_count):
        incorrect_labels.append(labels[i])
        incorrect_count += count

# Get Outlier indices
cur_outlier_indices = []
for idx in range(len(data)):
    if(t[idx] in incorrect_labels):
        cur_outlier_indices.append(idx)
        
outlier_indices.append(cur_outlier_indices)
print("Incorrect labels : ", incorrect_count)

# Plot Graph
plt.figure(figsize=[25,5])
plt.bar(labels, counts)
plt.yticks(np.arange(0,2601,200))
plt.grid(axis='y')
plt.show()
Incorrect labels :  80
In [17]:
# Column 8 (index 7)
# Output label : isMotivational
# Reduce to : Boolean [not_motivational:0, motivational:1]

cur_index = 7
incorrect_label_threshold_count = 100

# Get Column
t = data.iloc[:,cur_index]
labels, counts = np.unique(t, return_counts=True)

# Get Incorrect labels
incorrect_labels = []
incorrect_count = 0
for i, count in enumerate(counts):
    if(count < incorrect_label_threshold_count):
        incorrect_labels.append(labels[i])
        incorrect_count += count

# Get Outlier indices
cur_outlier_indices = []
for idx in range(len(data)):
    if(t[idx] in incorrect_labels):
        cur_outlier_indices.append(idx)
        
outlier_indices.append(cur_outlier_indices)
print("Incorrect labels : ", incorrect_count)

# Plot Graph
plt.figure(figsize=[25,5])
plt.bar(labels, counts)
plt.yticks(np.arange(0,4401,200))
plt.grid(axis='y')
plt.show()
Incorrect labels :  80
In [18]:
# Column 9 (index 8)
# Output label : Sentiment
# Reduce to : 5x ordinal [very_negative:0, negative:1, neutral:2, positive:3, very_positive:4]

cur_index = 8

# Get Column
t = data.iloc[:,cur_index]
labels, counts = np.unique(t, return_counts=True)

# Plot Graph
plt.figure(figsize=[25,5])
plt.bar(labels, counts)
plt.yticks(np.arange(0,3001,200))
plt.grid(axis='y')
plt.show()
In [20]:
absurd_indices, absurd_counts = np.unique(outlier_indices[0] + outlier_indices[1] + outlier_indices[2] + outlier_indices[3], return_counts=True)

print("Counts of absurdities per incorrect data indice :")
print(absurd_counts)

print("\nAbsurd indices :")
print(absurd_indices)
Counts of absurdities per incorrect data index :
[4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4
 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4
 4 4 4 4 4 4]

Absurd indices :
[ 117  174  351  434  578  587  601  667  684  772 1045 1167 1296 1304
 1355 1386 1490 1493 1651 1675 1888 1892 1982 1986 1992 2025 2044 2058
 2328 2576 2874 2923 2959 3042 3113 3342 3495 3536 3563 3598 3630 3670
 3674 3687 3698 3735 3774 3780 3940 4091 4143 4174 4312 4376 4544 4594
 4689 4943 4947 5060 5111 5144 5239 5251 5262 5382 5438 5465 5503 5700
 5817 5834 5952 5959 5994 6025 6105 6250 6360 6512]
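Since every flagged row appears in all four per-column lists (every count above is 4), a plain set union over the lists yields the same indices; a sketch:

absurd_set = sorted(set().union(*outlier_indices))  # same 80 indices, deduplicated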

Absurd memes

In [21]:
# for absurd_idx in absurd_indices:
#     print_img(data.iloc[absurd_idx,0])

Cleaning the Data

In [22]:
# Hard-coded from the absurd indices computed above
absurd_indices = [117, 174, 351, 434, 578, 587, 601, 667, 684, 772, 1045, 1167, 1296, 1304, 1355, 1386, 1490, 1493, 1651, 1675, 1888, 1892, 1982, 1986, 1992, 2025, 2044, 2058, 2328, 2576, 2874, 2923, 2959, 3042, 3113, 3342, 3495, 3536, 3563, 3598, 3630, 3670, 3674, 3687, 3698, 3735, 3774, 3780, 3940, 4091, 4143, 4174, 4312, 4376, 4544, 4594, 4689, 4943, 4947, 5060, 5111, 5144, 5239, 5251, 5262, 5382, 5438, 5465, 5503, 5700, 5817, 5834, 5952, 5959, 5994, 6025, 6105, 6250, 6360, 6512]

mask = np.ones(data.shape[0])
mask[absurd_indices] = 0

mask = np.asarray(mask, dtype=bool)
cdata = data[mask]
cdata = pd.DataFrame(cdata, dtype=str)
cdata = cdata.set_index(np.arange(0,len(cdata)))
cdata.head(5)
Out[22]:
0 1 2 3 4 5 6 7 8
0 10_year_2r94rv.jpg https://i.imgflip.com/2r94rv.jpg LOOK THERE MY FRIEND LIGHTYEAR NOW ALL SOHALIK... LOOK THERE MY FRIEND LIGHTYEAR NOW ALL SOHALIK... hilarious general not_offensive not_motivational very_positive
1 10_year_10-year-challenge_1547788782.jpeg https://spiderimg.amarujala.com/assets/images/... The best of #10 YearChallenge! Completed in le... The best of #10 YearChallenge! Completed in le... not_funny general not_offensive motivational very_positive
2 10_year_10yearchallenge-5c75f8b946e0fb0001edc7... https://www.lifewire.com/thmb/8wNfd94_meE9X2cp... Sam Thorne @Strippin ( Follow Follow Saw every... Sam Thorne @Strippin ( Follow Follow Saw every... very_funny not_sarcastic not_offensive not_motivational positive
3 10_year_10-year-challenge-sweet-dee-edition-40... https://pics.conservativememes.com/10-year-cha... 10 Year Challenge - Sweet Dee Edition 10 Year Challenge - Sweet Dee Edition very_funny twisted_meaning very_offensive motivational positive
4 10_year_10-year-challenge-with-no-filter-47-hi... https://pics.me.me/10-year-challenge-with-no-f... 10 YEAR CHALLENGE WITH NO FILTER 47 Hilarious ... 10 YEAR CHALLENGE WITH NO FILTER 47 Hilarious ... hilarious very_twisted very_offensive not_motivational neutral
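For reference, pandas can express the same row removal more directly than the boolean mask; a minimal sketch of an equivalent (untested here) alternative:

# Sketch : drop flagged rows and renumber in one chain
cdata_alt = data.drop(index=absurd_indices).reset_index(drop=True).astype(str)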
In [23]:
from matplotlib import gridspec

fig = plt.figure(figsize=[30,7])
gs = gridspec.GridSpec(1, 3, width_ratios=[4, 2, 5]) 

plt.subplot(gs[0])
plt.title('Offensiveness')
t = cdata.iloc[:,6]
labels, counts = np.unique(t, return_counts=True)
print(counts)
plt.bar(labels, counts)
plt.grid(axis='y') 

plt.subplot(gs[1])
plt.title('isMotivational')
t = cdata.iloc[:,7]
labels, counts = np.unique(t, return_counts=True)
plt.bar(labels, counts)
plt.grid(axis='y')

plt.subplot(gs[2])
plt.title('Sentiment')
t = cdata.iloc[:,8]
labels, counts = np.unique(t, return_counts=True)
plt.bar(labels, counts)
plt.grid(axis='y')

plt.show()
[ 207 2542 2408 1364]
In [43]:
print("BaseLines :")
print(" Offensiveness : {:.4f}".format(len( cdata.iloc[:,6][cdata.iloc[:,6] == 'not_offensive']) / len(cdata)))
print(" isMotivational : {:.4f}".format(len( cdata.iloc[:,7][cdata.iloc[:,7] == 'not_motivational']) / len(cdata)))
print(" Sentiment : {:.4f}".format(len( cdata.iloc[:,8][cdata.iloc[:,8] == 'positive']) / len(cdata)))
Baselines :
 Offensiveness : 0.3898
 isMotivational : 0.6485
 Sentiment : 0.4466
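These baselines are just the majority-class frequencies, so they can also be computed without hard-coding the majority label; a sketch:

for name, col in [('Offensiveness', 6), ('isMotivational', 7), ('Sentiment', 8)]:
    majority_share = cdata.iloc[:, col].value_counts(normalize=True).max()
    print(" {} : {:.4f}".format(name, majority_share))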
In [44]:
from matplotlib import gridspec

fig = plt.figure(figsize=[30,7])
gs = gridspec.GridSpec(1, 2, width_ratios=[4, 4]) 

plt.subplot(gs[0])
plt.title('Hilariousness')
t = cdata.iloc[:,4]
labels, counts = np.unique(t, return_counts=True)
plt.bar(labels, counts)
plt.grid(axis='y') 

plt.subplot(gs[1])
plt.title('Intention')
t = cdata.iloc[:,5]
labels, counts = np.unique(t, return_counts=True)
plt.bar(labels, counts)
plt.grid(axis='y')

plt.show()

Converting Hilariousness into Ordinal features

In [45]:
# # Label encode
# hilariousness = cdata.iloc[:,4]
# le = preprocessing.LabelEncoder()
# transf = le.fit_transform(hilariousness)
# print("Identified :", le.classes_)

# # Break into different categories
# hilariousness = np.zeros([hilariousness.shape[0],4])

# for idx,types in enumerate(transf):
#     hilariousness[idx][types] = 1
    
# print("Transformed :", le.inverse_transform([0,1,2,3]))

# # Make Dataframe
# cols = ['isFunny', 'isHilarious', 'isNotFunny', 'isVeryFunny']
# hilariousness = pd.DataFrame(hilariousness, columns = cols, dtype=np.int8)
# hilariousness_ordinal = hilariousness
# hilariousness_ordinal
In [46]:
hilariousness = pd.DataFrame(cdata.iloc[:,4])
hilariousness.columns = ['Hilariousness']

hilariousness = hilariousness.replace({'Hilariousness': {
                            'not_funny':0, 
                            'funny':1, 
                            'very_funny':2, 
                            'hilarious':3
                            }})

hilariousness
Out[46]:
Hilariousness
0 3
1 0
2 2
3 2
4 3
... ...
6516 2
6517 1
6518 1
6519 0
6520 0

6521 rows × 1 columns
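One caveat with this replace-based encoding (here and in the Intentions and label cells below): any label missing from the mapping dict passes through silently as a string. A quick sanity check, as a sketch:

# Sketch : confirm every value was actually mapped to 0-3
assert hilariousness['Hilariousness'].isin([0, 1, 2, 3]).all()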

Converting Intentions into Ordinal features

In [47]:
# # Label encode
# intentions = cdata.iloc[:,5]
# le = preprocessing.LabelEncoder()
# transf = le.fit_transform(intentions)
# print("Identified :", le.classes_)

# # Break into different categories
# intentions = np.zeros([intentions.shape[0],4])

# for idx,types in enumerate(transf):
#     intentions[idx][types] = 1
    
# print("Transformed :", le.inverse_transform([0,1,2,3]))

# # Make Dataframe
# cols = ['isGeneral', 'isNotSarcastic', 'isTwisted', 'isVeryTwisted']
# intentions = pd.DataFrame(intentions, columns = cols, dtype=np.int8)
# intentions_ordinal = intentions
# intentions_ordinal
In [48]:
intentions = pd.DataFrame(cdata.iloc[:,5])
intentions.columns = ['Intentions']

intentions = intentions.replace({'Intentions': {
                            'general':0, 
                            'not_sarcastic':1, 
                            'twisted_meaning':2, 
                            'very_twisted':3, 
                            }})

intentions
Out[48]:
Intentions
0 0
1 0
2 1
3 2
4 3
... ...
6516 2
6517 2
6518 0
6519 2
6520 1

6521 rows × 1 columns

Turning labels into Ordinals

In [49]:
labels = cdata.iloc[:,6:]
labels.columns = ['Offensiveness', 'isMotivational', 'Sentiment']
labels.head()
Out[49]:
Offensiveness isMotivational Sentiment
0 not_offensive not_motivational very_positive
1 not_offensive motivational very_positive
2 not_offensive not_motivational positive
3 very_offensive motivational positive
4 very_offensive not_motivational neutral
In [50]:
labels = labels.replace({'Offensiveness': {
                            'not_offensive':0, 
                            'slight':1, 
                            'very_offensive':2, 
                            'hateful_offensive':3
                            }})

labels = labels.replace({'isMotivational': {
                            'not_motivational':0, 
                            'motivational':1
                            }})

labels = labels.replace({'Sentiment': {
                            'very_negative':0, 
                            'negative':1, 
                            'neutral':2, 
                            'positive':3, 
                            'very_positive':4
                            }})

labels.head()
Out[50]:
Offensiveness isMotivational Sentiment
0 0 0 4
1 0 1 4
2 0 0 3
3 2 1 3
4 2 0 2

Converting templates into Categorical features

In [51]:
template_list = [
    "10_year",
    "avengers",
    "baby",
    "barney",
    "bean",
    "best",
    "bethe",
    "big_bang",
    "cat_U",
    "chandler",
    "chuck",
    "country",
    "deadpool",
    "decaprio",
    "distracted_bf",
    "dr_evil",
    "drunk_baby",
    "feminist",
    "friends",
    "gene",
    "gf",
    "giorgio",
    "godfather",
    "got",
    "harry",
    "harvey",
    "hillary",
    "hitler",
    "jim",
    "joker",
    "kim",
    "liam",
    "lor",
    "minion",
    "misog",
    "modi",
    "morpheous",
    "mrbri",
    "nemo",
    "obama",
    "penguin",
    "pepe",
    "picard",
    "putin",
    "racis",
    "rahul",
    "seal",
    "sexist",
    "skeptical",
    "spector",
    "spiderman",
    "sports",
    "stevejobs",
    "success",
    "tech",
    "third",
    "titanic",
    "tom",
    "trump",
    "x_men",
    "zeck",
]
In [52]:
file_names = cdata.iloc[:, 0]
file_templates_list = []

for file_name in file_names:
    file_templates = np.zeros([1,len(template_list)])
    
    for i,template in enumerate(template_list):
        if(file_name.find(template) != -1):            
            file_templates[0,i] = 1
#             print(template)
#     print(file_name)
#     print('------------------------------------------------------------------------------------------')

    file_templates_list.append(file_templates)
In [53]:
file_templates_list = np.asarray(file_templates_list).squeeze()
templates_df = pd.DataFrame(file_templates_list, columns=['template_'+template for template in template_list], dtype=np.int8)
templates_df
Out[53]:
template_10_year template_avengers template_baby template_barney template_bean template_best template_bethe template_big_bang template_cat_U template_chandler ... template_sports template_stevejobs template_success template_tech template_third template_titanic template_tom template_trump template_x_men template_zeck
0 1 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
1 1 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
2 1 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
3 1 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
4 1 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
6516 0 0 0 0 0 1 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
6517 0 0 0 0 0 1 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
6518 0 0 0 0 0 1 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
6519 0 0 0 0 0 1 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
6520 0 0 0 0 0 1 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0

6521 rows × 61 columns
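The filename loop above can also be vectorized with pandas string matching; a minimal sketch that should produce the same 0/1 template flags:

# Sketch : substring match per template, regex disabled for literal matching
templates_alt = pd.DataFrame({
    'template_' + t: file_names.str.contains(t, regex=False).astype(np.int8)
    for t in template_list})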


Cleaning Text

In [ ]:
import nltk
import spacy
import string
import re
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer 
from nltk.tokenize import word_tokenize 
In [55]:
### DEFAULT word preprocessing functions

# Turn to lowercase
def text_lowercase(text): 
    return text.lower() 

# Remove numbers 
def remove_numbers(text): 
    result = re.sub(r'\d+', '', text) 
    return result 

# Remove punctuation
# TODO : Replace punctuation with a space instead of deleting it, to prevent
# words concatenating (e.g. '#summer_break' -> 'summerbreak' in the test below)
def remove_punctuation(word_array):
    output_word_array = []
    translator = str.maketrans('', '', string.punctuation)  # build once, not per word

    for word in word_array:
        clean_word = word.translate(translator)
        if(clean_word != ''): output_word_array.append(clean_word)

    return output_word_array

# Remove stopwords function 
def remove_stopwords(text): 
    stop_words = set(stopwords.words("english")) 
    word_tokens = word_tokenize(text) 
    filtered_text = [word for word in word_tokens if word not in stop_words] 
    return filtered_text 

# Lemmatize string 
lemmatizer = WordNetLemmatizer() 
def lemmatize_word(word_array):
    lemmatized_word_array = []
    for word in word_array:
        word_tokens = word_tokenize(word) 
        lemmas = [lemmatizer.lemmatize(word, pos ='v') for word in word_tokens] 
        lemmatized_word_array += lemmas
    
    return lemmatized_word_array 
  
print(text_lowercase("Hey, did you know that the summer break is coming? Amazing right !! It's only 5 more days !!"))
print(remove_numbers("There are 3 balls in this bag, and 12 in the other one.")  )
print(remove_punctuation("Hey, did you know that the #summer_break is... coming? Amazing @right !! It's only 5 more days !!".split()))
print(remove_stopwords("This is a sample sentence and we are going to remove the stopwords from this.")) 
print(lemmatize_word('Data science uses scientific methods algorithms and many types of processes'.split())) 
hey, did you know that the summer break is coming? amazing right !! it's only 5 more days !!
There are  balls in this bag, and  in the other one.
['Hey', 'did', 'you', 'know', 'that', 'the', 'summerbreak', 'is', 'coming', 'Amazing', 'right', 'Its', 'only', '5', 'more', 'days']
['This', 'sample', 'sentence', 'going', 'remove', 'stopwords', '.']
['Data', 'science', 'use', 'scientific', 'methods', 'algorithms', 'and', 'many', 'type', 'of', 'process']
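Note that remove_stopwords above rebuilds the stopword set on every call; hoisting it out once is markedly faster over a full corpus. A sketch:

STOP_WORDS = set(stopwords.words('english'))  # built once, reused per call

def remove_stopwords_fast(text):
    return [word for word in word_tokenize(text) if word not in STOP_WORDS]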
In [56]:
### CUSTOM word preprocessing functions
### Domain specific

# Handle website names.
# Remove only the words containing '.com', '.co', '.net' ?
# Remove all words containing '.' except when the dot is the last character.
# Do this before removing punctuation.
def remove_word_containing_dot(text_arr):
    clean_text_arr = []
    
    for word in text_arr:
        if(word.find('.') == -1 or word.find('.') == len(word)-1):
            clean_text_arr.append(word)
    
    return clean_text_arr

# Handle Twitter artifacts.
# Remove common Twitter words.
# Remove month names.
def remove_twitter(text_arr):
    banned_words = ["retweets", "likes", "k", "pm", "follow"]
    months = ["jan", "feb", "mar", "apr", "may", "jun", "jul", "aug", "sep", "oct", "nov", "dec"]
    
    conc_remove_words = banned_words + months
    # print(conc_remove_words)
    
    clean_text_arr = []
        
    for word in text_arr:
        if(word not in conc_remove_words):
            clean_text_arr.append(word)
    
    return clean_text_arr

# Handle Twitter timestamps
# Remove all words after the first word containing ':'
# Bad idea : see the meme at idx 6.
# SKIPPED for now
def remove_all_text_after_colon(text):
    return text

print(remove_word_containing_dot("Me a memee text. SnoopyMems.com. Here more meme text. AnotherWebsite.net".split()))
print(remove_twitter("Kudus to @narendramodi ji 8:05 PM - 16 Jan 2019 from Mumbai, India".split()))
['Me', 'a', 'memee', 'text.', 'Here', 'more', 'meme', 'text.']
['Kudus', 'to', '@narendramodi', 'ji', '8:05', 'PM', '-', '16', 'Jan', '2019', 'from', 'Mumbai,', 'India']
In [57]:
# TODO : Don't remove years like 2009 and 2019 when stripping numbers; they carry meme context.


def clean_text(input_text):

    # 'text' rather than 'string' : avoid shadowing the string module
    text = text_lowercase(input_text)
    text = remove_numbers(text)
    text_arr = text.split()

    text_arr = remove_word_containing_dot(text_arr)
    text_arr = remove_punctuation(text_arr)
    text_arr = remove_twitter(text_arr)
#     text_arr = remove_stopwords(text)

    text_arr = lemmatize_word(text_arr)
    return text_arr

print(clean_text("Hey, did you know that 4 the summer Break is coming? Amazing right !! It's only 5 more days !!"))
print(clean_text("Me a memee text. SnoopyMems.com. Here more meme text. AnotherWebsite.net"))
print(clean_text("Kudus to @narendramodi ji 8:05 PM - 16 Jan 2019 from Mumbai, India"))
print(clean_text("Sam Thorne @Strippin ( Follow Follow Saw everyone posting these 2009 vs 2019 pics so here's mine 6:23 PM - 12 Jan 2019 O 636 Retweets 3 224 LIKES 65 636 3.2K "))
['hey', 'do', 'you', 'know', 'that', 'the', 'summer', 'break', 'be', 'come', 'amaze', 'right', 'its', 'only', 'more', 'days']
['me', 'a', 'memee', 'text', 'here', 'more', 'meme', 'text']
['kudus', 'to', 'narendramodi', 'ji', 'from', 'mumbai', 'india']
['sam', 'thorne', 'strippin', 'saw', 'everyone', 'post', 'these', 'vs', 'pics', 'so', 'heres', 'mine', 'o']
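With the pieces tested, the cleaner can be applied to the whole corrected-text column; a minimal sketch (tokens re-joined into strings for downstream vectorizers):

clean_corpus = cdata.iloc[:, 3].apply(lambda s: ' '.join(clean_text(s)))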
In [58]:
# Examine results

desc1 = data.iloc[:, 2]
desc2 = data.iloc[:, 3]

for i in range(50):
    print("#",i)
    text = desc2[i]
    print(text)
    print(clean_text(text))
    print_img(data.iloc[i,0])
    print("-------------------------------------------------------------------------")
    
# 0
LOOK THERE MY FRIEND LIGHTYEAR NOW ALL SOHALIKUT TREND PLAY THE 10 YEARS CHALLENGE AT FACEBOOK imgflip.com 
['look', 'there', 'my', 'friend', 'lightyear', 'now', 'all', 'sohalikut', 'trend', 'play', 'the', 'years', 'challenge', 'at', 'facebook']