First, I converted each tweet into a vector using scikit-learn's CountVectorizer, which turns a collection of text documents into a matrix of token counts. I then applied TF-IDF weighting to that count matrix, so that tokens appearing in many tweets carry less weight than distinctive ones.
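For illustration, here is a minimal sketch of these two steps on a toy corpus (toy strings, not the project data):
# Toy illustration of CountVectorizer -> TF-IDF weighting
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
toy = ["flight delayed again", "great flight crew"]
counts = CountVectorizer().fit_transform(toy)       # documents -> token-count matrix
weights = TfidfTransformer().fit_transform(counts)  # counts -> TF-IDF weights
print(weights.toarray().round(2))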
I used scikit-learn's Pipeline to chain these steps into a single workflow, which let me set up all the transformations and the classifier at once.
I trained three models: a Multinomial Naive Bayes classifier, a Random Forest classifier, and a LightGBM (Light Gradient Boosting Machine) classifier.
Using scikit-learn's GridSearchCV, I passed in a grid of candidate values for each model's hyperparameters, compared the five-fold mean roc_auc scores, and chose the best parameters for each model.
For each tuned model, I found the optimal classification threshold, computed the accuracy over the test set, and plotted the ROC curve. The threshold is chosen as the ROC point that maximizes TPR - FPR (Youden's J statistic).
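To make the threshold rule concrete, here is a toy sketch (illustrative numbers only, not the models' actual ROC output):
import numpy as np
# Pick the threshold at the ROC point maximizing TPR - FPR (Youden's J)
fpr = np.array([0.0, 0.2, 0.5, 1.0])
tpr = np.array([0.0, 0.7, 0.9, 1.0])
thresholds = np.array([1.0, 0.61, 0.40, 0.0])
best_threshold = thresholds[np.argmax(tpr - fpr)]  # 0.61 in this toy case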
I chose MultinomialNB as the final model because it achieved the highest accuracy (75.74%) and AUC (0.8283).
import nltk
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
from nltk.corpus import stopwords
import string
from tqdm import tqdm
import re
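# The stopword list requires the NLTK corpus; on a fresh environment,
# run nltk.download('stopwords') once before using stopwords.words('english').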
# Take a look at what the raw data looks like
with open('/Users/apple/Desktop/complaint.csv') as f:
    complaints = [line.rstrip() for line in f]
with open('/Users/apple/Desktop/noncomplaint.csv') as f:
    noncomplaints = [line.rstrip() for line in f]
# The data has three columns
complaints[0]
complaints = pd.read_csv('/Users/apple/Desktop/complaint.csv', sep=',')
noncomplaints = pd.read_csv('/Users/apple/Desktop/noncomplaint.csv', sep=',')
complaints['airline'].value_counts()
plt.figure(figsize=(10,6))
sns.countplot(x=complaints['airline'],palette = 'rainbow')
noncomplaints['airline'].value_counts()
plt.figure(figsize=(10,6))
sns.countplot(x=noncomplaints['airline'],palette = 'rainbow')
# Label complaints as 0 and noncomplaints as 1, then combine the two sets
complaints['label'] = 0
noncomplaints['label'] = 1
data = pd.concat([complaints, noncomplaints])
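As a sanity check, the combined class balance can be inspected:
data['label'].value_counts()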
data['tweet'] = data['tweet'].apply(lambda s:s.lower())
data.head()
def text_process(tweet):
    """
    Takes in a string of text, then performs the following:
    1. Removes all @-mentions
    2. Removes all punctuation
    3. Removes all stopwords (except a small whitelist)
    4. Returns a list of the cleaned tokens
    """
    # Keep a few stopwords that often signal sentiment
    whitelist = ["n't", "not", "no"]
    # Remove @-mentions
    tweet = re.sub(r'@\w+', '', tweet)
    # Drop punctuation characters and rejoin into a single string
    nopunc = ''.join(char for char in tweet if char not in string.punctuation)
    # Remove stopwords, keeping whitelisted words and tokens longer than one character
    stop_words = set(stopwords.words('english'))
    return [word for word in nopunc.split()
            if (word.lower() not in stop_words or word.lower() in whitelist)
            and len(word) > 1]
data['tweet'].head(5).apply(text_process)
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(data['tweet'], data['label'], test_size=0.2)
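# Note: this split is unseeded, so results vary from run to run; passing a fixed
# random_state (and stratify=data['label']) would make it reproducible and
# preserve the class ratio across train and test.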
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.pipeline import Pipeline
def getPipeline(model):
    """
    Build the full text-classification pipeline around the given model.
    """
    pipeline = Pipeline([
        ('bow', CountVectorizer(analyzer=text_process)),  # strings to token integer counts
        ('tfidf', TfidfTransformer()),                    # integer counts to weighted TF-IDF scores
        ('classifier', model),                            # train on TF-IDF vectors
    ])
    return pipeline
from sklearn.model_selection import KFold, cross_val_score, GridSearchCV
from sklearn.metrics import accuracy_score, roc_curve, auc, roc_auc_score
def gridSearchCV(model, params):
    """
    @param model: sklearn estimator
    @param params (dict): dictionary of candidate parameter values
    @return cv_results (DataFrame): tried parameters and their mean ROC AUC
    """
    model_cv = GridSearchCV(model, param_grid=params, scoring='roc_auc', cv=5)
    pipeline = getPipeline(model_cv)
    pipeline.fit(X_train, y_train)
    cv_results = pd.DataFrame(model_cv.cv_results_)[['params', 'mean_test_score']]
    return cv_results
def evaluate(model, plotROC=False):
    """
    Fit the pipeline on the training set, then:
    1. Report the test-set AUC (and optionally plot the ROC curve)
    2. Find the optimal threshold and report test-set accuracy at that threshold
    """
    pipeline = getPipeline(model)
    pipeline.fit(X_train, y_train)
    probs = pipeline.predict_proba(X_test)
    preds = probs[:, 1]
    fpr, tpr, threshold = roc_curve(y_test, preds)
    roc_auc = auc(fpr, tpr)
    print(f'AUC: {roc_auc:.4f}')
    # Find the optimal threshold: the ROC point maximizing TPR - FPR (Youden's J)
    rocDf = pd.DataFrame({'fpr': fpr, 'tpr': tpr, 'threshold': threshold})
    rocDf['tpr - fpr'] = rocDf.tpr - rocDf.fpr
    optimalThreshold = rocDf.threshold[rocDf['tpr - fpr'].idxmax()]
    print(f'Optimal threshold: {optimalThreshold:.2f}')
    # Get accuracy over the test set at the optimal threshold
    y_pred = np.where(preds >= optimalThreshold, 1, 0)
    accuracy = accuracy_score(y_test, y_pred)
    print(f'Accuracy: {accuracy*100:.2f}%')
    # Plot the ROC curve
    if plotROC:
        plt.title('Receiver Operating Characteristic')
        plt.plot(fpr, tpr, 'b', label='AUC = %0.2f' % roc_auc)
        plt.legend(loc='lower right')
        plt.plot([0, 1], [0, 1], 'r--')
        plt.xlim([0, 1])
        plt.ylim([0, 1])
        plt.ylabel('True Positive Rate')
        plt.xlabel('False Positive Rate')
        plt.show()
from sklearn.naive_bayes import MultinomialNB
params = {'alpha': [1.6,1.7,1.8,1.9,2.0,2.1,2.2,2.3,2.4]}
mnb = MultinomialNB(fit_prior=True, class_prior=None)
print(gridSearchCV(mnb, params))
mnb = MultinomialNB(alpha=2.3, fit_prior=True, class_prior=None)
evaluate(mnb, plotROC=True)
from sklearn.ensemble import RandomForestClassifier
params = {'n_estimators': [175,200,500]}
rfc = RandomForestClassifier(random_state=1)
print(gridSearchCV(rfc, params))
rfc = RandomForestClassifier(n_estimators=500, random_state=1)
evaluate(rfc, plotROC=True)
from lightgbm import LGBMClassifier
params1 = {'learning_rate': [0.01,0.05,0.1]}
params2 = {'n_estimators': [100,200,500]}
params3 = {'num_leaves': [5,6,7,8,9,10]}
params4 = {'min_data_in_leaf': [3,4,5,6,7,8]}
params5 = {'max_depth': [5, 6, 7 ,8, 9]}
params6 = {'max_bin': [45,50,55,60,65]}
params7 = {'bagging_fraction': [0.7,0.72,0.75,0.78,0.8]}
params8 = {'bagging_freq': [3,4,5,6,7,8,9]}
params9 = {'feature_fraction': [0.2,0.22,0.24,0.26]}
params10 = {'feature_fraction_seed': [5,6,7,8,9]}
params11 = {'bagging_seed': [5,6,7,8,9]}
params12 = {'min_sum_hessian_in_leaf': [7,8,9,10,11,12]}
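These twelve grids were searched one hyperparameter at a time; only the final bagging_seed search is shown below. As a hedged sketch (an illustration with a hypothetical fresh base estimator, not the notebook's actual run), such a sequential sweep could be scripted like this, fixing each best value before searching the next:
base_lgbm = LGBMClassifier()  # hypothetical starting point for the sweep
for params in [params1, params2, params3, params4, params5, params6,
               params7, params8, params9, params10, params11, params12]:
    results = gridSearchCV(base_lgbm, params)
    best = results.loc[results['mean_test_score'].idxmax(), 'params']
    base_lgbm.set_params(**best)  # fix the best value before the next search
    print(best)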
# LightGBM with the hyperparameters chosen by the one-at-a-time search
lightGBM = LGBMClassifier(learning_rate=0.01,
                          n_estimators=500,
                          num_leaves=5,
                          min_data_in_leaf=5,
                          max_depth=5,
                          max_bin=55,
                          bagging_fraction=0.8,
                          bagging_freq=4,
                          feature_fraction=0.22,
                          feature_fraction_seed=9,
                          bagging_seed=7,
                          min_sum_hessian_in_leaf=7)
print(gridSearchCV(lightGBM, params11))
# Re-instantiate a fresh (unfitted) estimator with the chosen hyperparameters
lightGBM = LGBMClassifier(learning_rate=0.01,
                          n_estimators=500,
                          num_leaves=5,
                          min_data_in_leaf=5,
                          max_depth=5,
                          max_bin=55,
                          bagging_fraction=0.8,
                          bagging_freq=4,
                          feature_fraction=0.22,
                          feature_fraction_seed=9,
                          bagging_seed=7,
                          min_sum_hessian_in_leaf=7)
evaluate(lightGBM, plotROC=True)
mytest_set = pd.read_csv('/Users/apple/Downloads/test_data.csv', sep=',')
mytest_set.head()
mytest_set['tweet'] = mytest_set['tweet'].apply(lambda s:s.lower())
mytest_set['tweet'].head(5).apply(text_process)
test = mytest_set['tweet']
test.head()
X = data['tweet']
y = data['label']
final_model = getPipeline(mnb).fit(X, y)
predictions = final_model.predict_proba(test)[:, 1]
# Apply the optimal threshold found during evaluation (0.61) to get hard labels
predictions = np.where(predictions >= 0.61, 1, 0)
mytest_set['predictions'] = predictions
mytest_set['predictions'].value_counts()
output = mytest_set[mytest_set['predictions'] == 1].drop(
    ['Unnamed: 0', 'tid_not_to_be_used', 'airline', 'tag'],
    axis=1).reset_index(drop=True)
output.head()
mytest_set[mytest_set['predictions']==1]['tweet'].head(5).apply(text_process)
output.to_csv('/Users/apple/Desktop/output2.csv', sep=',', encoding='utf-8')
After manually reading all the tweets my model predicted as noncomplaints on this test set, 180 of the 239 predictions were correct.
The precision of my model is therefore around 75.3%.
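As a quick check:
print(f'Precision: {180/239:.1%}')  # 180 correct out of 239 -> 75.3%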