Recurrent Neural Network: LSTM/GRU models
Thanks to this blog for the inspiration: https://towardsdatascience.com/machine-learning-word-embedding-sentiment-classification-using-keras-b83c28087456
There are 7,613 tweets in the training set and 3,263 tweets in the test set. Each sample in the train and test set has an id, a keyword, a location, and the tweet text; the training set additionally carries the binary target (1 for a tweet about a real disaster, 0 otherwise).
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
import string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from google.colab import drive
drive.mount('/content/drive')
train = pd.read_csv('/content/drive/My Drive/Colab Notebooks/twitter disaster/train.csv')
test = pd.read_csv('/content/drive/My Drive/Colab Notebooks/twitter disaster/test.csv')
train.head()
train.shape
test.shape
test.head()
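The column layout described above can be confirmed right after loading; the names in the comments follow the standard Kaggle disaster-tweets files.
print(train.columns.tolist())   # expected: ['id', 'keyword', 'location', 'text', 'target']
print(test.columns.tolist())    # expected: ['id', 'keyword', 'location', 'text']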
train.keyword.isna().sum()
train.location.isna().sum()
kw = train.keyword.value_counts().head(10).reset_index()
kw.columns=['keyword','frequency']
loc = train.location.value_counts().head(10).reset_index()
loc.columns=['location','frequency']
def plot_frequency(data, title):
    plt.figure(figsize=(10, 6))
    plt.title('Most Frequent ' + title, fontsize=20, fontweight='bold', pad=20)
    sns.barplot(x=title, y='frequency', data=data, palette='Set2')
plot_frequency(kw,'keyword')
plot_frequency(loc,'location')
train.keyword.nunique()
# keywords whose tweets most often describe real disasters (largest sum of target)
imp = train.groupby('keyword')['target'].sum().reset_index().sort_values('target', ascending=False).head(20)
wdlist = list(imp.keyword)
def getwordcloud(ls):
    # join the keywords into a single string for the word cloud
    word = ' '
    for i in range(len(ls)):
        word = word + ls[i] + ' '
    wordcloud = WordCloud(width=800, height=800,
                          background_color='white',
                          min_font_size=10).generate(word)
    plt.figure(figsize=(10, 5), facecolor=None)
    plt.imshow(wordcloud)
    plt.axis("off")
    plt.tight_layout(pad=0)
    plt.show()
getwordcloud(wdlist)
import nltk
nltk.download('punkt')
nltk.download('stopwords')
def text_processing(data):
    """
    Create a word list for each tweet:
    1. convert all the words in the tweets to lower case
    2. remove punctuation from each word
    3. remove remaining tokens that are not alphabetic
    4. filter out stop words
    """
    tweets = list()
    lines = data.text.values.tolist()
    for line in lines:
        tokens = word_tokenize(line)
        # convert to lower case
        tokens = [w.lower() for w in tokens]
        # remove punctuation from each word
        table = str.maketrans('', '', string.punctuation)
        stripped = [w.translate(table) for w in tokens]
        # remove remaining tokens that are not alphabetic
        words = [word for word in stripped if word.isalpha()]
        # filter out stop words
        my_stopwords = ['http', 'https', 'amp', 'nt']
        stop_words = set(stopwords.words('english'))
        words = [word for word in words if word not in stop_words and word not in my_stopwords]
        tweets.append(words)
    return tweets
train_text = text_processing(train)
test_text = text_processing(test)
len(train_text)
len(test_text)
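To see what the cleaning actually produces, it helps to peek at one processed tweet; this just reuses the lists built above.
# each entry is a list of lower-cased, alphabetic, non-stopword tokens from one tweet
print(train_text[0])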
def load_word2vec():
    """ Load Word2Vec Vectors
    Return:
        wv_from_bin: All 3 million embeddings, each of length 300
    """
    import gensim.downloader as api
    wv_from_bin = api.load("word2vec-google-news-300")  # load 300-dimensional word vectors
    # note: .vocab is the gensim < 4.0 API; gensim 4+ uses key_to_index instead
    vocab = list(wv_from_bin.vocab.keys())
    print("Loaded vocab size %i" % len(vocab))
    return wv_from_bin
wv_from_bin = load_word2vec()
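The loaded object is a gensim KeyedVectors store: every word maps to a 300-dimensional vector, and nearest-neighbour queries come for free. A quick sketch of inspecting it, assuming the gensim < 4.0 attribute names used above and that 'disaster' is in the vocabulary:
vec = wv_from_bin.word_vec('disaster')                 # 300-dimensional numpy vector
print(vec.shape)                                       # (300,)
print(wv_from_bin.most_similar('disaster', topn=3))    # closest words by cosine similarity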
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
def get_pad(data):
    # vectorize the text
    tokenizer_obj = Tokenizer()
    tokenizer_obj.fit_on_texts(data)
    sequences = tokenizer_obj.texts_to_sequences(data)
    # pad sequences
    word_index = tokenizer_obj.word_index
    print('Found %s unique tokens' % len(word_index))
    # find the max length over all sequences
    find_max = []
    for i in range(len(sequences)):
        find_max.append(len(sequences[i]))
    max_length = max(find_max)
    text_pad = pad_sequences(sequences, maxlen=max_length)
    print('Shape of text tensor:', text_pad.shape)
    return text_pad
def get_wordindex(data):
    # vectorize the text
    tokenizer_obj = Tokenizer()
    tokenizer_obj.fit_on_texts(data)
    word_index = tokenizer_obj.word_index
    return word_index
def get_maxlength(data):
    # vectorize the text
    tokenizer_obj = Tokenizer()
    tokenizer_obj.fit_on_texts(data)
    sequences = tokenizer_obj.texts_to_sequences(data)
    # find the max length over all sequences
    find_max = []
    for i in range(len(sequences)):
        find_max.append(len(sequences[i]))
    max_length = max(find_max)
    return max_length
def get_embedding(word_index):
    # create a matrix for the vocabulary of our training set: each row is one vocabulary word,
    # each column is one dimension of its embedding vector
    embedding = np.zeros((len(word_index) + 1, 300))
    for word in word_index.keys():
        if word in wv_from_bin.vocab.keys():
            # copy the pretrained word2vec vector (gensim < 4.0 API)
            embedding[word_index[word], :] = np.array(wv_from_bin.word_vec(word))
        else:
            # words missing from word2vec get a random vector
            embedding[word_index[word], :] = np.random.randn(300)
    return embedding
train_pad = get_pad(train_text)
word_index = get_wordindex(train_text)
embedding = get_embedding(word_index)
max_length = get_maxlength(train_text)
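A quick consistency check on the embedding matrix: the row the tokenizer assigns to a word should hold its word2vec vector (or a random one if the word is out of vocabulary). A minimal sketch reusing the objects built above; 'fire' is only an example token and may not appear in this word_index:
word = 'fire'  # hypothetical example token, pick any key of word_index
if word in word_index and word in wv_from_bin.vocab:
    row = embedding[word_index[word]]
    assert np.allclose(row, wv_from_bin.word_vec(word))   # the pretrained vector was copied in
    print(word, '->', row.shape)                          # (300,)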
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM, GRU, Bidirectional, BatchNormalization, Dropout
from keras.initializers import Constant
embedding_dim = 300
# embedding layer initialised with the pretrained word2vec matrix and frozen during training
Embedding_layer = Embedding(len(word_index) + 1, embedding_dim,
                            embeddings_initializer=Constant(embedding),
                            input_length=max_length,
                            trainable=False)
model=Sequential()
model.add(Embedding_layer)
model.add(Bidirectional(GRU(32,dropout=0.2,recurrent_dropout=0.1,return_sequences=True)))
model.add(Bidirectional(GRU(32,dropout=0.2,recurrent_dropout=0.1)))
model.add(Dense(1,activation='sigmoid'))
model.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])
model.summary()
model.fit(train_pad,train.target.values,batch_size=128,epochs=15, validation_split = 0.2,verbose=2)
model2=Sequential()
model2.add(Embedding_layer)
model2.add(Dropout(0.2))
model2.add(Bidirectional(LSTM(256)))
model2.add(Dropout(0.2))
model2.add(Dense(512, activation='relu'))
model2.add(Dropout(0.5))
model2.add(BatchNormalization())
model2.add(Dense(1,activation='sigmoid'))
model2.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])
model2.summary()
model2.fit(train_pad,train.target.values,batch_size=128,epochs=15, validation_split = 0.2,verbose=2)
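Both fit calls above discard the History object that Keras returns, which is handy for eyeballing overfitting across the 15 epochs. A minimal sketch, assuming the first call is rewritten as history = model.fit(...):
history = model.fit(train_pad, train.target.values, batch_size=128,
                    epochs=15, validation_split=0.2, verbose=2)
plt.plot(history.history['accuracy'], label='train')           # key is 'acc' on older Keras versions
plt.plot(history.history['val_accuracy'], label='validation')  # 'val_acc' on older Keras versions
plt.legend()
plt.show()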
# note: get_pad fits a fresh tokenizer on the test text, so its word indices and padded length
# are not guaranteed to match the ones the model was trained on; see the sketch at the end for a consistent variant
test_pad = get_pad(test_text)
preds = model.predict(test_pad)
predictions = np.where(preds >= 0.9,1,0)
output = pd.DataFrame({'id': test.id, 'target': predictions.ravel()})
output.head()
output.shape
output.to_csv('/content/drive/My Drive/Colab Notebooks/twitter disaster/output.csv', index=False, sep=',', encoding='utf-8')
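For a consistent pipeline, the test tweets should be indexed with the tokenizer fitted on the training text and padded to the same max_length the model was trained with, rather than with a freshly fitted tokenizer. A minimal sketch of that variant, reusing only objects defined above:
# fit the tokenizer once, on the training text only
tokenizer_obj = Tokenizer()
tokenizer_obj.fit_on_texts(train_text)
# reuse the same tokenizer and the training max_length for the test set
test_sequences = tokenizer_obj.texts_to_sequences(test_text)
test_pad_consistent = pad_sequences(test_sequences, maxlen=max_length)
preds = model.predict(test_pad_consistent)
predictions = np.where(preds >= 0.9, 1, 0)
output = pd.DataFrame({'id': test.id, 'target': predictions.ravel()})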