import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
# Load the Google Play Store export (the path looks like a Colab Drive mount).
raw = pd.read_csv("/content/drive/My Drive/Home/bibina Google play/googleplaystore.csv")
raw.head()
raw.info()

# Keep only complete rows for the exploratory plots. The explicit copy makes
# `data` independent of `raw`, so the column assignments performed on `data`
# further down are unambiguous and do not trigger SettingWithCopy warnings.
data = raw.dropna().copy()

# Top five categories by mean rating. `numeric_only=True` keeps the text
# columns out of the aggregation; without it pandas >= 2.0 raises a TypeError.
data.groupby('Category').mean(numeric_only=True).sort_values(by='Rating', ascending=False).head(5)
# Box plot of the rating spread within each category.
plt.figure(figsize=(16, 8))
# NOTE: despite its name, `fig` holds the Axes returned by seaborn.
fig = sns.boxplot(data=data, x="Category", y="Rating", palette="Set1")
# Turn the category names vertical so they do not overlap.
category_labels = fig.get_xticklabels()
fig.set_xticklabels(category_labels, rotation=90)
plt.title('Distribution of Ratings in Each Category', fontsize=20)
# Convert Reviews to numbers once and reuse the result; the original computed
# the conversion, threw it away, and then repeated it inside the plot call.
reviews = pd.to_numeric(data.Reviews)

# Scatter plot with a fitted regression line: rating against review count.
# x and y are passed as Series, so no separate `data=` frame is needed.
plt.figure(figsize=(10, 5))
sns.regplot(x=reviews, y=data.Rating)
plt.title('Rating vs Reviews', fontsize=20)
data.Size.unique()

# Treat the 'Varies with device' placeholder as missing. The result is bound
# to a new Series rather than using a chained `inplace=True` replace on
# `data.Size`, which is not guaranteed to update `data` under Copy-on-Write.
size_text = data['Size'].replace('Varies with device', np.nan)

# Split each entry into its numeric part and its unit suffix.
size_number = size_text.str.replace(r'[kM]+$', '', regex=True).astype(float)
size_unit = size_text.str.extract(r'([kM])$', expand=False)

# Express every size in kilobytes: 'k' -> x1, 'M' -> x1000, no suffix -> x1.
# (The original pattern used an upper-case 'K', so 'k' entries were only
# handled correctly through the fill-with-1 fallback.)
size_scale = size_unit.map({'k': 1, 'M': 1000}).fillna(1)
data['Size'] = size_number * size_scale

# Scatter plot with a fitted regression line: rating against app size.
plt.figure(figsize=(10, 5))
sns.regplot(x=data['Size'], y=data['Rating'])
plt.title('Rating vs Size', fontsize=20)
data.Installs.unique()

# Mean of the numeric columns per install bucket. `numeric_only=True` keeps
# text columns out of the aggregation (pandas >= 2.0 raises otherwise).
ins = data.groupby('Installs').mean(numeric_only=True)

# Bar chart of the mean rating per install bucket. The original passed the
# unrelated `data` frame next to vectors taken from `ins`; giving seaborn a
# single frame with named columns removes that length mismatch.
plt.figure(figsize=(16, 8))
fig = sns.barplot(x='Installs', y='Rating', data=ins.reset_index(), palette="Set1")
fig.set_xticklabels(fig.get_xticklabels(), rotation=60)
plt.title('Ratings vs Install Times', fontsize=20)
# Strip the dollar sign(s) from Price and store the column as floats.
price_text = data.Price.replace(r'[$]+', '', regex=True)
data.Price = price_text.astype(float)

# Rating against price over the full price range.
plt.figure(figsize=(10, 5))
sns.regplot(x=data.Price, y=data.Rating)
plt.title('Rating vs Price', fontsize=20)

# Same relationship, restricted to apps priced at 50 or below.
priced_up_to_50 = data[data.Price <= 50]
plt.figure(figsize=(10, 5))
sns.regplot(x='Price', y='Rating', data=priced_up_to_50)
plt.title('Rating vs Price(less than $50)', fontsize=20)
# Select the apps that cost nothing and carry a perfect 5.0 rating.
is_free = data.Price == 0
is_five_star = data.Rating == 5.0
fivestar = data[is_free & is_five_star]
fivestar.shape

# Count those apps per category.
plt.figure(figsize=(16, 8))
fig = sns.countplot(data=fivestar, x='Category', palette='Set1')
tick_labels = fig.get_xticklabels()
fig.set_xticklabels(tick_labels, rotation=60)
plt.title('5 Star Free Apps in Each Category ', fontsize=20)
From the previous analysis, I found that around 1,500 ratings are missing. Next, I will use the rows that do have a rating to train a model that predicts the missing ones.
Now it is time to choose useful features for the prediction model. I will drop the following columns:
# Drop the columns that are not used as model features. Rebinding `raw`
# instead of dropping in place leaves the previously loaded object untouched.
raw = raw.drop(columns=['App', 'Last Updated', 'Current Ver', 'Type', 'Genres'])

# Remove rows whose Size cannot be turned into a number, and take a copy so
# the column assignments below write to an independent frame.
unusable_sizes = ['1,000+', 'Varies with device']
raw = raw[~raw['Size'].isin(unusable_sizes)].copy()

# Convert Size to kilobytes: 'k' -> x1, 'M' -> x1000, no suffix -> x1.
# (The original pattern used an upper-case 'K', so 'k' entries were only
# handled correctly through the fill-with-1 fallback.)
size_number = raw['Size'].str.replace(r'[kM]+$', '', regex=True).astype(float)
size_unit = raw['Size'].str.extract(r'([kM])$', expand=False)
raw['Size'] = size_number * size_unit.map({'k': 1, 'M': 1000}).fillna(1)

# Review counts become numbers.
raw['Reviews'] = pd.to_numeric(raw['Reviews'])
# Partition the remaining columns by dtype: int64/float64 versus object.
num_col = list(raw.select_dtypes(include=['int64', 'float64']).columns)
cat_col = list(raw.select_dtypes(include=['object']).columns)
cat_col
num_col
raw.head()

# One-hot encode the object columns, prefixing each indicator with its source
# column name and dropping the first level of every column.
new = pd.get_dummies(raw, prefix=cat_col, drop_first=True)
new
# Filter out the test set: rows without a rating are the ones to predict.
test = new[new.Rating.isna()]

# Build the target and the features from the SAME set of complete rows. The
# original took the target from `new.Rating.dropna()` and the features from
# `new.dropna()`, which go out of step as soon as any feature column has a
# missing value in a rated row.
complete_rows = new.dropna()
label = complete_rows['Rating']
label
data = complete_rows.drop('Rating', axis=1)
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
# Standardise every feature column; the scaler is fitted on all labelled rows.
scaler = StandardScaler().fit(data)
X = scaler.transform(data)

# Reproducible 80/20 hold-out split of the scaled features and their ratings.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    label,
    test_size=0.2,
    random_state=101,
)
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error
def getAccuracy_cv(model):
    """Fit *model* on the training split and report its cross-validated MAE.

    Reads the module-level ``X_train``, ``y_train``, ``X`` and ``label``.

    Args:
        model: An estimator with scikit-learn's ``fit``/``predict`` interface.

    Returns:
        The mean absolute error averaged over the 10 cross-validation folds.
        The same value is printed, as before.
    """
    from sklearn.model_selection import cross_val_score

    # cross_val_score works on clones of the estimator, so this explicit fit
    # is what leaves `model` itself trained for the caller. The hold-out
    # prediction that used to follow it was never used and has been removed.
    model.fit(X_train, y_train)

    # The scorer returns negated errors (higher is better); flip the sign back.
    fold_errors = -cross_val_score(
        model, X, label, cv=10, scoring='neg_mean_absolute_error'
    )
    mean_error = fold_errors.mean()
    print(mean_error)
    return mean_error
# Hyperparameter tuning: vary the number of boosting rounds while every other
# setting stays fixed, and print the cross-validated error for each candidate.
paraList = [500, 1000, 1500]
fixed_settings = dict(
    learning_rate=0.1,
    max_depth=5,
    min_child_weight=1,
    gamma=0,
    subsample=0.8,
    colsample_bytree=0.8,
    nthread=4,
    scale_pos_weight=1,
    seed=27,
)
for i in paraList:
    model = XGBRegressor(n_estimators=i, **fixed_settings)
    getAccuracy_cv(model)
# Final regressor: the same settings as the tuning candidates, with 500
# boosting rounds.
final_settings = {
    'learning_rate': 0.1,
    'n_estimators': 500,
    'max_depth': 5,
    'min_child_weight': 1,
    'gamma': 0,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'nthread': 4,
    'scale_pos_weight': 1,
    'seed': 27,
}
final_model = XGBRegressor(**final_settings)
getAccuracy_cv(final_model)
# Keep only the feature columns of the unrated rows; Rating is empty there.
test = test.drop(columns=['Rating'])
test.head()