# import useful packages
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
# read the raw data
Data = pd.read_csv("HMAHCC_COMP.csv")
testData = pd.read_csv("holdOut_processed.csv")
# Keep the unique id lists so training and testing rows can be separated again later
testID = pd.DataFrame({'id':testData.id.unique()})
trainID = pd.DataFrame({'id':Data.id.unique()})
print(trainID.shape)
print(testID.shape)
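# Quick sanity check (illustrative, assuming 'id' identifies a patient in both
# files): the training and holdout id lists should be disjoint
print(len(set(trainID.id) & set(testID.id)), "overlapping ids between train and holdout")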
# We only need patient behavior on or before the opioid-naive event (Days <= 0), since only pre-event data can be used for prediction
trainData = Data[Data.Days <= 0]
fullData = pd.concat([trainData, testData], axis=0)
print(fullData.shape)
label = pd.read_csv("label.csv")
label.shape
# Merge train data with label
trainDataLabel = trainData.merge(label, on='id')
def analyze_attr(attr, df):
    """Count ids per (attr value, LTOP class) and rank attr values by the class difference."""
    df1 = df[['id', attr, 'LTOP']].copy().groupby(['id', attr]).mean().reset_index()
    df2 = df1.groupby([attr, 'LTOP']).count().reset_index()
    df2.columns = [attr, 'LTOP', 'count']
    df3 = df2.pivot_table(values='count', index=[attr], columns=['LTOP']).reset_index()
    df3.columns = [attr, 'LTOP_0', 'LTOP_1']
    df3['diff'] = df3.LTOP_1 - df3.LTOP_0
    return df3.sort_values('diff', ascending=False)
def plot_attr(attr, df, size):
    """Count plot of attr values per id, split by LTOP class."""
    df1 = df[['id', attr, 'LTOP']].copy().groupby(['id', attr]).mean().reset_index()
    plt.figure(figsize=(size, 4))
    sns.countplot(x=attr, hue='LTOP', palette='coolwarm', data=df1)
EDA_data = Data.merge(label, on='id')
# Opioid Rx claims: paid Rx rows that carry a day-supply count
opioid = EDA_data[pd.notnull(EDA_data['PAY_DAY_SUPPLY_CNT']) & (EDA_data['event_descr'] == 'RX Claim - Paid')]
#analyze_attr('event_attr5',trainDataLabel).head(5)
analyze_attr('event_attr1',opioid).head(5)
plot_attr('event_attr1',opioid,12)
analyze_attr('event_attr5',opioid)
analyze_attr('Specialty',opioid)
plot_attr('event_attr5',opioid,50)
plot_attr('Specialty',opioid,50)
opioidDay0Supplies = opioid[opioid.Days==0][['id','PAY_DAY_SUPPLY_CNT','LTOP']].groupby('id').max().reset_index() #7
opioidDay0Supplies.columns = ['id','opioidDay0Supplies','LTOP']
opioidDay0Supplies.head()
sns.boxplot(x='LTOP',y='opioidDay0Supplies',data=opioidDay0Supplies,palette='coolwarm')
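# A numeric companion to the boxplot above (a quick sketch): summary statistics
# of the day-0 supply counts per LTOP class
opioidDay0Supplies.groupby('LTOP')['opioidDay0Supplies'].describe()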
OtherRX = EDA_data[(EDA_data['event_descr']=='RX Claim - Paid')& (EDA_data['Days']<=0)]
OtherRX.head()
def getcountdiff(attr, df, name):
    """Average per-id event counts by attr value and LTOP class; return the top-5 values by absolute class difference."""
    df1 = df[['id', attr, 'LTOP', 'event_attr5']].copy().groupby(['id', attr, 'LTOP']).count().reset_index()
    df2 = df1.copy().groupby([attr, 'LTOP']).mean().reset_index()
    df2.columns = [attr, 'LTOP', 'count']
    df2.sort_values('count', ascending=False, inplace=True)
    df3 = df2.pivot(index=attr, columns='LTOP', values='count').reset_index()
    df3.columns = [name, 'LTOP_0', 'LTOP_1']
    df3['diff'] = abs(df3['LTOP_0'] - df3['LTOP_1'])
    df3.sort_values('diff', ascending=False, inplace=True)
    return df3.head(5)
def plotcountdiff(attr, df, name):
    """Bar plot of the top-5 values returned by getcountdiff."""
    df4 = getcountdiff(attr, df, name).melt(id_vars=name, value_vars=['LTOP_0', 'LTOP_1'])
    return sns.barplot(y=name, x='value', hue='variable', data=df4, palette='coolwarm')
getcountdiff('event_attr1',OtherRX,'drugclass')
plotcountdiff('event_attr1',OtherRX,'Drug Class')
getcountdiff('event_attr6',OtherRX,'Drug Group')
plotcountdiff('event_attr6',OtherRX,'Drug Group')
FullyPaid = EDA_data[(EDA_data['event_descr']=='Fully Paid Claim')& (EDA_data['Days']<=0)]
FullyPaid.head()
getcountdiff('event_attr1',FullyPaid,'diagnosis')
plotcountdiff('event_attr1',FullyPaid,'Diagnosis')
getcountdiff('event_attr2',FullyPaid,'Treatment Place')
plotcountdiff('event_attr2',FullyPaid,'Treatment Place')
Call = EDA_data[EDA_data['event_descr'].isin(['Inbound Call by Mbr',
                                              'Inbound Call by Prov',
                                              'Inbound Call by Other'])
                & (EDA_data['Days'] <= 0)]
getcountdiff('event_attr2',Call,'Call Inquiry')
plotcountdiff('event_attr2',Call,'Call Inquiry')
getcountdiff('event_attr3',Call,'Disposition')
plotcountdiff('event_attr3',Call,'Disposition')
# Keep only the rows for RX Claim - Paid events
rxClaimPaid = fullData[fullData.event_descr=='RX Claim - Paid'].copy()
rxClaimPaid.shape
# Rows with a day-supply count correspond to opioid prescriptions
opioid = rxClaimPaid[rxClaimPaid.PAY_DAY_SUPPLY_CNT >= 0].copy()
opioid.shape
len(opioid.id.unique())
def getDummies(col, data):
"""
Get dummy variables for specified column
"""
df = data[['id', col]].copy()
df = pd.get_dummies(df.set_index('id'), drop_first=True).groupby(['id']).max().reset_index()
return df
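# Illustrative toy example (hypothetical data) of what getDummies produces:
# each id is flagged for every value it ever appears with (first category dropped)
toy = pd.DataFrame({'id': [1, 1, 2], 'Specialty': ['PAIN', 'ORTHO', 'PAIN']})
getDummies('Specialty', toy)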
def getCount(col, data):
"""
    Return a wide table: one row per id, one column per value of `col`,
    with each cell holding that id's event count
"""
df = data.groupby(['id', col])['event_descr'].count().reset_index()
df = df.pivot(index='id', columns=col, values='event_descr').reset_index().fillna(0)
return df
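# Similarly, a toy example (hypothetical data) of the wide count table from getCount:
toy = pd.DataFrame({'id': [1, 1, 1, 2],
                    'event_attr1': ['A', 'A', 'B', 'A'],
                    'event_descr': ['RX Claim - Paid'] * 4})
getCount('event_attr1', toy)  # -> id 1: A=2, B=1; id 2: A=1, B=0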
opioidDrugClass = getCount('event_attr1', opioid) # 1
opioidBrandName = getCount('event_attr5', opioid) # 2
opioidSpecialty = getCount('Specialty', opioid) # 3
print(opioidDrugClass.shape)
opioidDrugClass.head(3)
print(opioidBrandName.shape)
opioidBrandName.head(3)
print(opioidSpecialty.shape)
opioidSpecialty.head(3)
opioidCount = opioid.groupby('id')['event_descr'].count().reset_index() # 4
opioidCount.columns = ['id', 'opioidCount']
print(opioidCount.shape)
opioidCount.head(3)
opioidDaysSupplied = opioid.groupby('id')['PAY_DAY_SUPPLY_CNT'].sum().reset_index() # 5
opioidDaysSupplied.columns = ['id', 'opioidDaysSupplied']
print(opioidDaysSupplied.shape)
opioidDaysSupplied.head(3)
opioidDay0Specialty = getDummies('Specialty', opioid[opioid.Days==0]) # 6
print(opioidDay0Specialty.shape)
opioidDay0Specialty.head(3)
opioidDay0Supplies = opioid[opioid.Days==0][['id','PAY_DAY_SUPPLY_CNT']].groupby('id').max().reset_index() #7
opioidDay0Supplies.columns = ['id', 'opioidDay0Supplies']
print(opioidDay0Supplies.shape)
opioidDay0Supplies.head(3)
fullData.event_descr.value_counts()
fullyPaidClaim = fullData[fullData.event_descr=='Fully Paid Claim'].copy()
fullyPaidClaim.shape
def getAmount(event):
"""
Get Count, ChargeAmount, NetPaidAmount and ResponsibleAmount
for specified event
"""
    # Filter rows for the specified event
    data = fullData[fullData.event_descr == event].copy()
    # Convert amount columns to float
    data['event_attr3'] = data['event_attr3'].astype(float)
    data['event_attr4'] = data['event_attr4'].astype(float)
    data['event_attr5'] = data['event_attr5'].astype(float)
# Group by sum of charged amount
groupbyData = data.groupby('id').agg({'event_attr3': ['count', 'sum'],
'event_attr4': 'sum',
'event_attr5': 'sum'})
groupbyData.columns = [
'Count', 'ChargeAmount', 'NetPaidAmount', 'ResponsibleAmount']
groupbyData = groupbyData.reset_index()
return groupbyData
claimAmount = getAmount('Fully Paid Claim') #8
print(claimAmount.shape)
claimAmount.head(3)
fullyPaidClaim = fullData[fullData.event_descr=='Fully Paid Claim'].copy()
fullyPaidClaimTrainLabel = trainDataLabel[trainDataLabel.event_descr=='Fully Paid Claim'].copy()
def getDifference(col, data, n):
    """Rank values of `col` by the absolute difference in mean per-id event counts between LTOP classes; also return the top-n list."""
    df0 = data.groupby(['id', col, 'LTOP'])['event_descr'].count().reset_index().fillna(0)
    df1 = df0.groupby([col, 'LTOP'])['event_descr'].mean().reset_index()
    df1 = df1.pivot(index=col, columns='LTOP', values='event_descr').reset_index()
    df1.columns = [col, 'LTOP_0', 'LTOP_1']
    df1['diff'] = abs(df1.LTOP_0 - df1.LTOP_1)
    df1 = df1.sort_values('diff', ascending=False)
    differenceList = list(df1[col].head(n))
    return df1, differenceList
def getPopular(col, data, n):
    """Rank values of `col` by the number of distinct ids; also return the top-n list."""
    df = data.groupby([col])['id'].nunique().reset_index().sort_values('id', ascending=False)
    popularList = list(df[col].head(n))
    return df, popularList
def getImportant(col, data, n1, n2):
    """Values of `col` that are both highly class-discriminating (top n1) and popular (top n2)."""
    df1, differenceList = getDifference(col, data, n1)
    df2, popularList = getPopular(col, data, n2)
    return [event for event in differenceList if event in popularList]
def getImportantCount(col, importantList, data):
    """Per-id counts (via getCount) restricted to the important values of `col`."""
    return getCount(col, data[data[col].isin(importantList)])
importantList = getImportant('event_attr2', fullyPaidClaimTrainLabel, 30, 30) #9
treatmentPlace = getImportantCount('event_attr2', importantList, fullyPaidClaim)
print(treatmentPlace.shape)
treatmentPlace.head(3)
# getDifference('event_attr1', fullyPaidClaimTrain, 30)[0]
# getPopular('event_attr1', fullyPaidClaimTrain, 30)[0]
# getImportant('event_attr1', fullyPaidClaimTrain, 1000, 1000)
importantList = getDifference('event_attr1', fullyPaidClaimTrainLabel, 100)[1]
diagnosisClaim = getImportantCount('event_attr1', importantList, fullyPaidClaim) #10
print(diagnosisClaim.shape)
diagnosisClaim.head(3)
rxClaimPaidLabel = rxClaimPaid.merge(label, on='id')
drugClass = getImportantCount('event_attr1', getImportant('event_attr1', rxClaimPaidLabel, 200, 200), rxClaimPaid) #11
print(drugClass.shape)
drugClass.head(3)
drugGroup = getCount('event_attr6', rxClaimPaid) #12
print(drugGroup.shape)
drugGroup.head()
rxClaimPaid.shape
def getRxCost():
"""
Get total RxCost and NetPaidAmount for Rx Claim Paid
"""
    # get Rx Claim - Paid data
    data = rxClaimPaid.copy()
    # Convert amount columns to float
    data['event_attr3'] = data['event_attr3'].astype(float)
    data['event_attr4'] = data['event_attr4'].astype(float)
# Group by sum of charged amount
groupbyData = data.groupby('id').agg({'event_attr3': ['count', 'sum'],
'event_attr4': 'sum'})
groupbyData.columns = [
'Count', 'RxCost', 'NetPaidAmount']
groupbyData = groupbyData.reset_index()
return groupbyData
rxCost = getRxCost() #13
print(rxCost.shape)
rxCost.head(3)
inboundCall = fullData[fullData['event_descr'].isin(['Inbound Call by Mbr',
                                                     'Inbound Call by Prov',
                                                     'Inbound Call by Other'])].copy()
inboundCallLabel = inboundCall.merge(label, on='id')
callCategory = getImportantCount('event_attr1',
getImportant('event_attr1', inboundCallLabel, 200, 200), inboundCall) # 14
print(callCategory.shape)
callCategory.head(3)
# getDifference('event_attr2', inboundCallLabel, 30)[0]
# getPopular('event_attr1', rxClaimPaidLabel, 30)[0]
# getImportant('event_attr2', inboundCallLabel, 100, 100)
# callCategory = getImportantDummies('event_attr1', getImportant('event_attr1', inboundCallLabel, 200, 200), inboundCall) # 14
callInquiry = getImportantCount('event_attr2',
getImportant('event_attr2', inboundCallLabel, 200, 200), inboundCall) # 15
print(callInquiry.shape)
callInquiry.head(3)
callDisposition = getImportantCount('event_attr3',
getImportant('event_attr3', inboundCallLabel, 200, 200), inboundCall) # 16
print(callDisposition.shape)
callDisposition.head(3)
# event_attr5 holds 'City, State'; fill missing values with a 'Na, Na'
# sentinel so the split below always yields a state token
inboundCall['state'] = inboundCall['event_attr5'].fillna(
    'Na, Na').apply(lambda location: location.split(', ')[1])
callState = getDummies('state', inboundCall) # 17
print(callState.shape)
callState.head(3)
fullData.event_descr.value_counts()
surgeryAmount = getAmount('Surgery') # 18
print(surgeryAmount.shape)
surgeryAmount.head(3)
newCPDAmount = getAmount('New diagnosis - CPD') # 19
print(newCPDAmount.shape)
newCPDAmount.head(3)
newHyperAmount = getAmount('New diagnosis - Hypertension') # 20
print(newHyperAmount.shape)
newHyperAmount.head(3)
newTop5Amount = getAmount('New diagnosis - Top 5') # 21
print(newTop5Amount.shape)
newTop5Amount.head(3)
newCHFAmount = getAmount('New diagnosis - CHF') # 22
print(newCHFAmount.shape)
newCHFAmount.head(3)
newDiabAmount = getAmount('New diagnosis - Diabetes') # 23
print(newDiabAmount.shape)
newDiabAmount.head(3)
features = [opioidDrugClass, opioidBrandName, opioidSpecialty, opioidCount,
opioidDaysSupplied, opioidDay0Specialty, opioidDay0Supplies, claimAmount,
treatmentPlace, diagnosisClaim, drugClass, drugGroup,
rxCost, callCategory, callInquiry, callDisposition, callState,
surgeryAmount, newCPDAmount, newHyperAmount, newTop5Amount, newCHFAmount, newDiabAmount]
from functools import reduce
# Outer-merge all per-id feature tables into one wide frame keyed on id
mergedDf = reduce(lambda left, right: pd.merge(
    left, right, on=['id'], how='outer'), features)
# Impute missing values with 0: an id absent from a feature table simply had no events of that type
mergedDf = mergedDf.fillna(0)
mergedDf.shape
# Split train data and test data
trainDf = mergedDf.merge(trainID, on='id')
testDf = mergedDf.merge(testID, on='id')
# Add label to train data
trainDf = trainDf.merge(label, how='outer', on='id').fillna(0)
# Save data (testID is saved as well because it is reloaded below)
trainDf.to_csv('trainDf.csv', index=False)
testDf.to_csv('testDf.csv', index=False)
testID.to_csv('testID.csv', index=False)
# import useful packages
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.feature_selection import RFECV
import warnings
warnings.filterwarnings('ignore')
pd.options.display.max_columns = None
X = trainDf.drop(['LTOP','id'], axis=1)
target = trainDf['LTOP']
model = ExtraTreesClassifier()
model.fit(X, target)
# visualize the important features
dset = pd.DataFrame()
dset['attr'] = X.columns
dset['importance'] = model.feature_importances_
dset = dset.sort_values(by='importance', ascending=False)
dplot = dset.head(10)
plt.figure(figsize=(16, 8))
plt.barh(y=dplot['attr'], width=dplot['importance'], color='cornflowerblue')
plt.title('Feature Importances', fontsize=20, fontweight='bold', pad=20)
plt.xlabel('Importance', fontsize=14, labelpad=20)
plt.show()
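# RFECV is imported above but never applied; a minimal sketch of how it could
# be used here (left commented out because recursive elimination over this many
# columns is slow, hence the coarse step and small fold count):
# rfecv = RFECV(ExtraTreesClassifier(n_estimators=100), step=10,
#               cv=StratifiedKFold(3), scoring='roc_auc')
# rfecv.fit(X, target)
# print("Optimal number of features:", rfecv.n_features_)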
trainDf = pd.read_csv("/content/trainDf.csv")
testDf = pd.read_csv("/content/testDf.csv")
testID = pd.read_csv("/content/testID.csv")
print(trainDf.shape)
print(testDf.shape)
allData = pd.concat([trainDf.drop(['LTOP'], axis=1).set_index('id'), testDf.set_index('id')], axis=0)
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
allData = pd.DataFrame(scaler.fit_transform(allData), columns=allData.columns)
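# Note: the scaler above is fit on train and holdout rows together, which leaks
# holdout statistics into preprocessing. A leakage-free variant (sketch) would
# fit on the training rows only before transforming everything:
# scaler.fit(allData.iloc[:trainDf.shape[0]])
# allData = pd.DataFrame(scaler.transform(allData), columns=allData.columns)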
# Split the scaled matrix back into train and holdout rows by position
n_train = trainDf.shape[0]  # 13997 in this dataset
X = allData.iloc[:n_train, ]
X_testDf = allData.iloc[n_train:, ]
y = trainDf.LTOP
print(X.shape)
print(y.shape)
print(X_testDf.shape)
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
X, y, test_size=0.2, random_state=2019)
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score, roc_curve, auc, roc_auc_score
from sklearn.model_selection import KFold, cross_val_score
def getAccuracy(model, threshold):
    """Fit the model, binarize probabilities at `threshold`, and report confusion matrix, classification report and accuracy."""
    model.fit(X_train, y_train)
probs = model.predict_proba(X_test)[:, 1]
y_pred = np.where(probs >= threshold, 1, 0)
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("\n")
print("Classification Report:\n",classification_report(y_test, y_pred))
return accuracy_score(y_test, y_pred)
n_folds = 10
def cv_ROC_AUC(model):
"""
Return the average AUC score.
"""
# Set KFold to shuffle data before the split
kf = KFold(n_folds, shuffle=True, random_state=42)
# Get accuracy score
roc_auc = cross_val_score(model, X, y, scoring="roc_auc", cv=kf)
return roc_auc.mean()
def get_ROC_AUC(model):
"""
Return AUC score
"""
model.fit(X_train, y_train)
probs = model.predict_proba(X_test)[:,1]
roc_auc = roc_auc_score(y_test, probs)
return roc_auc
def plotROC(model):
"""
1. Plot ROC AUC
2. Return the best threshold
"""
model.fit(X_train, y_train)
probs = model.predict_proba(X_test)
preds = probs[:,1]
fpr, tpr, threshold = roc_curve(y_test, preds)
roc_auc = auc(fpr, tpr)
# Plot ROC AUC
plt.title('Receiver Operating Characteristic')
plt.plot(fpr, tpr, 'b', label = 'AUC = %0.2f' % roc_auc)
plt.legend(loc = 'lower right')
plt.plot([0, 1], [0, 1],'r--')
plt.xlim([0, 1])
plt.ylim([0, 1])
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.show()
    # Find the threshold that maximizes Youden's J statistic (tpr - fpr)
rocDf = pd.DataFrame({'fpr': fpr, 'tpr':tpr, 'threshold':threshold})
rocDf['tpr - fpr'] = rocDf.tpr - rocDf.fpr
optimalThreshold = rocDf.threshold[rocDf['tpr - fpr'].idxmax()]
return optimalThreshold
def evaluate(model):
optimalThreshold = plotROC(model)
print("Optimal Threshold: ", optimalThreshold)
print("\n")
print("Accuracy: ", getAccuracy(model, optimalThreshold))
from sklearn.ensemble import RandomForestClassifier
rfc = RandomForestClassifier(n_estimators=1000)
print("AUC score: ", get_ROC_AUC(rfc))
evaluate(rfc)
from sklearn.linear_model import LogisticRegression
logit = LogisticRegression(penalty='l1', C=0.5, solver='liblinear', max_iter=1000)
print("AUC score: ", get_ROC_AUC(logit))
evaluate(logit)
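# With the L1 penalty many coefficients shrink to exactly zero; a quick look
# at the surviving features (illustrative; logit was fit inside evaluate above)
coefs = pd.Series(logit.coef_[0], index=X.columns)
print("Non-zero coefficients:", (coefs != 0).sum())
coefs[coefs != 0].abs().sort_values(ascending=False).head(10)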
from lightgbm import LGBMClassifier
# Model tuning
paraList = [500, 1000, 1500]
roc_auc_list = []
for i in paraList:
lgb = LGBMClassifier(objective='binary',
learning_rate=0.049,
n_estimators=i, # 1462
num_leaves=8,
min_data_in_leaf=4,
max_depth=3,
max_bin=41,
bagging_fraction=0.845,
bagging_freq=5,
feature_fraction=0.24,
feature_fraction_seed=9,
bagging_seed=9,
min_sum_hessian_in_leaf=11)
roc_auc = get_ROC_AUC(lgb)
roc_auc_list.append(roc_auc)
print("Parameter: ", i, "\tAUC score: ", roc_auc)
plt.plot(np.array(paraList),
np.array(roc_auc_list))
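# An alternative to sweeping n_estimators by hand is LightGBM early stopping.
# The exact API depends on the installed lightgbm version (older releases take
# fit(..., early_stopping_rounds=50); newer ones use callbacks), so this is
# left as a commented sketch:
# from lightgbm import early_stopping
# lgb = LGBMClassifier(objective='binary', n_estimators=5000, learning_rate=0.03)
# lgb.fit(X_train, y_train, eval_set=[(X_test, y_test)], eval_metric='auc',
#         callbacks=[early_stopping(50)])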
# Best model chosen after tuning
lgb = LGBMClassifier(objective='binary',
learning_rate=0.03,
n_estimators=1130,
num_leaves=8,
min_data_in_leaf=1,
max_depth=10,
max_bin=80,
bagging_fraction=0.8,
bagging_freq=5,
feature_fraction=0.24,
feature_fraction_seed=9,
bagging_seed=9,
min_sum_hessian_in_leaf=12)
cv_ROC_AUC(lgb)
evaluate(lgb)
# Use test data to make predictions
lgb.fit(X, y)
predictions = lgb.predict_proba(X_testDf)[:, 1]
output = pd.DataFrame({'ID': testDf.id,
'SCORE': predictions})
testID.columns = ['ID']
# Outer merge so holdout ids with no extracted features still receive a (zero) score
output = output.merge(testID, on='ID', how='outer').fillna(0)
output['RANK'] = output.SCORE.rank(method='min', ascending=False)
output = output.sort_values('SCORE', ascending=False)
output.head(10)
pd.Series(np.where(output.SCORE >= 0.5, 1, 0)).value_counts()
y.value_counts()
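# Compare the predicted positive rate on the holdout with the training
# prevalence (illustrative; 0.5 is an arbitrary cutoff, not the tuned threshold)
print("Predicted positive rate:", (output.SCORE >= 0.5).mean())
print("Training positive rate:", y.mean())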