Housing Price Competition

[Image: Ames Housing dataset]

Load data

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
In [3]:
train_data = pd.read_csv('train.csv')
test_data = pd.read_csv('test.csv')
In [4]:
train_data.shape
Out[4]:
(1460, 81)
In [14]:
test_data.shape
Out[14]:
(1459, 80)
In [17]:
# keep the Id columns for the submission file (the competition expects the column name 'Id')
testID = pd.DataFrame({'Id': test_data.Id})
trainID = pd.DataFrame({'Id': train_data.Id})
In [12]:
# combine train and test features for a full view of the data
full_data = pd.concat([train_data.copy().drop('SalePrice', axis=1), test_data])
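Since full_data is not used again below, one natural use for it is a quick missing-value census across both splits before preprocessing. A minimal sketch (not run in the original notebook):

missing = full_data.isnull().sum().sort_values(ascending=False)  # NaN count per column
missing[missing > 0].head(10)                                    # columns with the most missing values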

EDA

In [56]:
train_data.head()
Out[56]:
Id MSSubClass MSZoning LotFrontage LotArea Street Alley LotShape LandContour Utilities ... PoolArea PoolQC Fence MiscFeature MiscVal MoSold YrSold SaleType SaleCondition SalePrice
0 1 60 RL 65.0 8450 Pave NaN Reg Lvl AllPub ... 0 NaN NaN NaN 0 2 2008 WD Normal 208500
1 2 20 RL 80.0 9600 Pave NaN Reg Lvl AllPub ... 0 NaN NaN NaN 0 5 2007 WD Normal 181500
2 3 60 RL 68.0 11250 Pave NaN IR1 Lvl AllPub ... 0 NaN NaN NaN 0 9 2008 WD Normal 223500
3 4 70 RL 60.0 9550 Pave NaN IR1 Lvl AllPub ... 0 NaN NaN NaN 0 2 2006 WD Abnorml 140000
4 5 60 RL 84.0 14260 Pave NaN IR1 Lvl AllPub ... 0 NaN NaN NaN 0 12 2008 WD Normal 250000

5 rows × 81 columns

1. Categorical variables

In [23]:
categorical_cols = [cname for cname in train_data.columns if
                   train_data[cname].dtype == 'object']
In [25]:
len(categorical_cols)
Out[25]:
43
In [26]:
categorical_cols
Out[26]:
['MSZoning',
 'Street',
 'Alley',
 'LotShape',
 'LandContour',
 'Utilities',
 'LotConfig',
 'LandSlope',
 'Neighborhood',
 'Condition1',
 'Condition2',
 'BldgType',
 'HouseStyle',
 'RoofStyle',
 'RoofMatl',
 'Exterior1st',
 'Exterior2nd',
 'MasVnrType',
 'ExterQual',
 'ExterCond',
 'Foundation',
 'BsmtQual',
 'BsmtCond',
 'BsmtExposure',
 'BsmtFinType1',
 'BsmtFinType2',
 'Heating',
 'HeatingQC',
 'CentralAir',
 'Electrical',
 'KitchenQual',
 'Functional',
 'FireplaceQu',
 'GarageType',
 'GarageFinish',
 'GarageQual',
 'GarageCond',
 'PavedDrive',
 'PoolQC',
 'Fence',
 'MiscFeature',
 'SaleType',
 'SaleCondition']
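Before one-hot encoding these columns (done in the preprocessing section below), it can be useful to check their cardinality, since a high-cardinality column expands into many one-hot features. A quick sketch:

# number of distinct levels per categorical column, highest first
train_data[categorical_cols].nunique().sort_values(ascending=False).head(10)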
In [31]:
# helper functions for categorical plots; width/height set the figure size
# and rotation tilts the x-axis tick labels
def bplot(x, width, height, rotation):
    plt.figure(figsize=(width, height))
    plt.xticks(rotation=rotation)
    return sns.barplot(x=x, y='SalePrice', data=train_data)
def bxplot(x, width, height, rotation):
    plt.figure(figsize=(width, height))
    plt.xticks(rotation=rotation)
    return sns.boxplot(x=x, y='SalePrice', data=train_data)
def vplot(x, width, height, rotation):
    plt.figure(figsize=(width, height))
    plt.xticks(rotation=rotation)
    return sns.violinplot(x=x, y='SalePrice', data=train_data)
def cplot(x, width, height, rotation):
    plt.figure(figsize=(width, height))
    plt.xticks(rotation=rotation)
    return sns.countplot(x=x, data=train_data)
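The four helpers differ only in the seaborn function they call, so they could be folded into a single wrapper. A possible refactor (plot_cat is a hypothetical name, not used elsewhere in this notebook):

def plot_cat(plot_fn, x, width, height, rotation, **kwargs):
    # plot_fn is any axes-level seaborn function, e.g. sns.boxplot
    plt.figure(figsize=(width, height))
    plt.xticks(rotation=rotation)
    return plot_fn(x=x, data=train_data, **kwargs)

# e.g. plot_cat(sns.boxplot, 'Neighborhood', 12, 4, 40, y='SalePrice')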
In [34]:
bplot('SaleType',10,4,0)
Out[34]:
<matplotlib.axes._subplots.AxesSubplot at 0x1247b3898>
In [35]:
bxplot('Neighborhood',12,4,40)
Out[35]:
<matplotlib.axes._subplots.AxesSubplot at 0x1248ad438>
In [36]:
vplot('HouseStyle',8,4,0)
Out[36]:
<matplotlib.axes._subplots.AxesSubplot at 0x124bbfb70>
In [38]:
cplot('GarageType',10,4,0)
Out[38]:
<matplotlib.axes._subplots.AxesSubplot at 0x126d95208>

2. Numerical variables

In [68]:
numerical_cols = [cname for cname in train_data.columns if
                   train_data[cname].dtype in ['int64','float64']]
In [69]:
numerical_cols.remove('SalePrice')
numerical_cols.remove('Id')
In [70]:
len(numerical_cols)
Out[70]:
36
In [71]:
correlation = train_data[numerical_cols].corr()
fig = plt.figure(figsize=(12,12))
sns.heatmap(correlation,cmap='magma')
Out[71]:
<matplotlib.axes._subplots.AxesSubplot at 0x14a70e6d8>
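The heatmap shows feature-feature correlations only, since SalePrice was removed from numerical_cols. To rank features by their correlation with the target, a sketch along these lines would work:

# correlation of each numerical feature with SalePrice, strongest first
corr_with_target = train_data[numerical_cols + ['SalePrice']].corr()['SalePrice']
corr_with_target.drop('SalePrice').sort_values(ascending=False).head(10)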
In [81]:
f = plt.figure(figsize=(12,20))

for i in range(len(numerical_cols)):
    f.add_subplot(9, 4, i+1)
    sns.scatterplot(x=train_data[numerical_cols].iloc[:, i], y=train_data.SalePrice, color='purple')
    
plt.tight_layout()

3. Target variable

In [83]:
# distribution of the overall house prices
sns.distplot(train_data['SalePrice'],color='purple')
Out[83]:
<matplotlib.axes._subplots.AxesSubplot at 0x153f92cf8>
  • Sale prices range from roughly 50 to 500 thousand dollars, with most clustered around 200 thousand; the distribution is right-skewed
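Because the distribution is right-skewed, a common variation (not applied in this notebook) is to model log(SalePrice) and invert the transform after predicting:

y_log = np.log1p(train_data['SalePrice'])  # log(1 + price) compresses the right tail
sns.distplot(y_log, color='purple')
# after training on y_log, invert predictions with np.expm1(log_preds)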

Data Preprocessing

In [73]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
In [74]:
X = train_data.drop(['Id','SalePrice'],axis=1)
y = train_data.SalePrice
In [75]:
from sklearn.model_selection import train_test_split
In [76]:
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.3, random_state=101)
  • Split the columns by data type (categorical vs. numerical)
In [91]:
# select categorical columns 
categorical_cols = [cname for cname in X_train.columns if
                   X_train[cname].dtype == 'object']
# select numerical columns
numerical_cols = [cname for cname in X_train.columns if
                 X_train[cname].dtype in ['int64','float64']]
# keep selected columns only
my_cols = categorical_cols + numerical_cols
X_train = X_train[my_cols].copy()
X_valid = X_valid[my_cols].copy()
  • Create a pipeline for preprocessing
In [78]:
# impute missing values in numerical columns with the mean
numerical_transformer = SimpleImputer(strategy='mean')
# impute missing values in categorical columns with the mode, then one-hot encode
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])
# bundle preprocessing for numerical and categorical data
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)
    ])
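As a sanity check (not part of the original run), the preprocessor can be fit on the training split alone to see how many features come out of imputation plus one-hot encoding:

X_train_prepared = preprocessor.fit_transform(X_train)
X_train_prepared.shape  # (rows, numerical cols + total one-hot categories)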

Modeling

In [79]:
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error
import warnings
warnings.filterwarnings('ignore')
pd.options.display.max_columns = None
  • Random Forest
In [80]:
# define the model
model1 = RandomForestRegressor(n_estimators=1000, random_state=0)
# bundle preprocessing and model in a pipeline
clf = Pipeline(steps=[('preprocessor', preprocessor),
                      ('model', model1)
                      ])
# fit the model
clf.fit(X_train, y_train)
# get predictions
preds = clf.predict(X_valid)
# cross validation
from sklearn.model_selection import cross_val_score
scores = -1 * cross_val_score(clf, X, y, cv=5, scoring='neg_mean_absolute_error')
print(scores.mean())
17530.5413260274
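mean_absolute_error was imported earlier but never called; the holdout error on the 30% validation split could be reported alongside the CV score with one extra line:

print(mean_absolute_error(y_valid, preds))  # MAE on the held-out split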
  • XGBoost
In [89]:
# define the model
model2 = XGBRegressor(
    learning_rate=0.1,
    n_estimators=1000,
    max_depth=5,
    min_child_weight=1,
    gamma=0,
    subsample=0.8,
    colsample_bytree=0.8,
    nthread=4,
    scale_pos_weight=1,
    seed=27)
# bundle preprocessing and model in a pipeline
clf = Pipeline(steps=[('preprocessor', preprocessor),
                      ('model', model2)
                      ])
# fit the model
clf.fit(X_train, y_train)
# get predictions
preds = clf.predict(X_valid)
# cross validation (cross_val_score was imported in the previous cell)
scores = -1 * cross_val_score(clf, X, y, cv=10, scoring='neg_mean_absolute_error')
print(scores.mean())
[14:38:29] WARNING: src/objective/regression_obj.cu:152: reg:linear is now deprecated in favor of reg:squarederror.
(the same warning repeats for each subsequent model fit)
15590.204938998286
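The warning comes from this XGBoost version's default objective name reg:linear, which was renamed reg:squarederror with identical behavior; passing the objective explicitly would silence it:

# same model, explicit objective; results are unchanged
model2 = XGBRegressor(objective='reg:squarederror', learning_rate=0.1,
                      n_estimators=1000, max_depth=5, min_child_weight=1,
                      gamma=0, subsample=0.8, colsample_bytree=0.8,
                      nthread=4, scale_pos_weight=1, seed=27)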

Predict

In [96]:
test_data = test_data.drop('Id',axis=1)
In [106]:
test_pred = clf.predict(test_data)
test_pred
Out[106]:
array([122502.04 , 146333.83 , 176828.17 , ..., 173329.17 , 117575.914,
       225758.78 ], dtype=float32)
In [117]:
sp = pd.DataFrame(test_pred, columns=['SalePrice'])
In [120]:
output = pd.concat([testID,sp],axis=1)
In [121]:
output.to_csv('submission.csv', index=False)
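A final sanity check that the file matches the competition's expected format (columns Id and SalePrice, one row per test house):

submission = pd.read_csv('submission.csv')
print(submission.shape)  # expect (1459, 2)
submission.head()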