Housing Price Competition

[Image: Ames Housing dataset]

Load data

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
In [3]:
train_data = pd.read_csv('train.csv')
test_data = pd.read_csv('test.csv')
In [4]:
train_data.shape
Out[4]:
(1460, 81)
In [14]:
test_data.shape
Out[14]:
(1459, 80)
In [17]:
# keep the Id columns for the submission file (the competition expects the column name 'Id')
testID = pd.DataFrame({'Id': test_data.Id})
trainID = pd.DataFrame({'Id': train_data.Id})
In [12]:
# combine train and test features for a full view of the data
full_data = pd.concat([train_data.copy().drop('SalePrice', axis=1), test_data])
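Since full_data is not used again below, one natural use for it is a quick missing-value census across both splits before preprocessing. A minimal sketch (not run in the original notebook):

missing = full_data.isnull().sum().sort_values(ascending=False)  # NaN count per column
missing[missing > 0].head(10)                                    # columns with the most missing values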

EDA

In [56]:
train_data.head()
Out[56]:
Id MSSubClass MSZoning LotFrontage LotArea Street Alley LotShape LandContour Utilities ... PoolArea PoolQC Fence MiscFeature MiscVal MoSold YrSold SaleType SaleCondition SalePrice
0 1 60 RL 65.0 8450 Pave NaN Reg Lvl AllPub ... 0 NaN NaN NaN 0 2 2008 WD Normal 208500
1 2 20 RL 80.0 9600 Pave NaN Reg Lvl AllPub ... 0 NaN NaN NaN 0 5 2007 WD Normal 181500
2 3 60 RL 68.0 11250 Pave NaN IR1 Lvl AllPub ... 0 NaN NaN NaN 0 9 2008 WD Normal 223500
3 4 70 RL 60.0 9550 Pave NaN IR1 Lvl AllPub ... 0 NaN NaN NaN 0 2 2006 WD Abnorml 140000
4 5 60 RL 84.0 14260 Pave NaN IR1 Lvl AllPub ... 0 NaN NaN NaN 0 12 2008 WD Normal 250000

5 rows × 81 columns

1. Categorical variables

In [23]:
categorical_cols = [cname for cname in train_data.columns if
                   train_data[cname].dtype == 'object']
In [25]:
len(categorical_cols)
Out[25]:
43
In [26]:
categorical_cols
Out[26]:
['MSZoning',
 'Street',
 'Alley',
 'LotShape',
 'LandContour',
 'Utilities',
 'LotConfig',
 'LandSlope',
 'Neighborhood',
 'Condition1',
 'Condition2',
 'BldgType',
 'HouseStyle',
 'RoofStyle',
 'RoofMatl',
 'Exterior1st',
 'Exterior2nd',
 'MasVnrType',
 'ExterQual',
 'ExterCond',
 'Foundation',
 'BsmtQual',
 'BsmtCond',
 'BsmtExposure',
 'BsmtFinType1',
 'BsmtFinType2',
 'Heating',
 'HeatingQC',
 'CentralAir',
 'Electrical',
 'KitchenQual',
 'Functional',
 'FireplaceQu',
 'GarageType',
 'GarageFinish',
 'GarageQual',
 'GarageCond',
 'PavedDrive',
 'PoolQC',
 'Fence',
 'MiscFeature',
 'SaleType',
 'SaleCondition']
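Before one-hot encoding these columns (done in the preprocessing section below), it can be useful to check their cardinality, since a high-cardinality column expands into many one-hot features. A quick sketch:

# number of distinct levels per categorical column, highest first
train_data[categorical_cols].nunique().sort_values(ascending=False).head(10)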
In [31]:
# helper functions for categorical plots; width/height set the figure size
# and rotation tilts the x-axis tick labels
def bplot(x, width, height, rotation):
    plt.figure(figsize=(width, height))
    plt.xticks(rotation=rotation)
    return sns.barplot(x=x, y='SalePrice', data=train_data)
def bxplot(x, width, height, rotation):
    plt.figure(figsize=(width, height))
    plt.xticks(rotation=rotation)
    return sns.boxplot(x=x, y='SalePrice', data=train_data)
def vplot(x, width, height, rotation):
    plt.figure(figsize=(width, height))
    plt.xticks(rotation=rotation)
    return sns.violinplot(x=x, y='SalePrice', data=train_data)
def cplot(x, width, height, rotation):
    plt.figure(figsize=(width, height))
    plt.xticks(rotation=rotation)
    return sns.countplot(x=x, data=train_data)
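The four helpers differ only in the seaborn function they call, so they could be folded into a single wrapper. A possible refactor (plot_cat is a hypothetical name, not used elsewhere in this notebook):

def plot_cat(plot_fn, x, width, height, rotation, **kwargs):
    # plot_fn is any axes-level seaborn function, e.g. sns.boxplot
    plt.figure(figsize=(width, height))
    plt.xticks(rotation=rotation)
    return plot_fn(x=x, data=train_data, **kwargs)

# e.g. plot_cat(sns.boxplot, 'Neighborhood', 12, 4, 40, y='SalePrice')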
In [34]:
bplot('SaleType',10,4,0)
Out[34]:
<matplotlib.axes._subplots.AxesSubplot at 0x1247b3898>
In [35]:
bxplot('Neighborhood',12,4,40)
Out[35]:
<matplotlib.axes._subplots.AxesSubplot at 0x1248ad438>
In [36]:
vplot('HouseStyle',8,4,0)
Out[36]:
<matplotlib.axes._subplots.AxesSubplot at 0x124bbfb70>
In [38]:
cplot('GarageType',10,4,0)
Out[38]:
<matplotlib.axes._subplots.AxesSubplot at 0x126d95208>

2. Numerical variables

In [68]:
numerical_cols = [cname for cname in train_data.columns if
                   train_data[cname].dtype in ['int64','float64']]
In [69]:
numerical_cols.remove('SalePrice')
numerical_cols.remove('Id')
In [70]:
len(numerical_cols)
Out[70]:
36
In [71]:
correlation = train_data[numerical_cols].corr()
fig = plt.figure(figsize=(12,12))
sns.heatmap(correlation,cmap='magma')
Out[71]:
<matplotlib.axes._subplots.AxesSubplot at 0x14a70e6d8>
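The heatmap shows feature-feature correlations only, since SalePrice was removed from numerical_cols. To rank features by their correlation with the target, a sketch along these lines would work:

# correlation of each numerical feature with SalePrice, strongest first
corr_with_target = train_data[numerical_cols + ['SalePrice']].corr()['SalePrice']
corr_with_target.drop('SalePrice').sort_values(ascending=False).head(10)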
In [81]:
f = plt.figure(figsize=(12,20))

for i in range(len(numerical_cols)):
    f.add_subplot(9, 4, i+1)
    sns.scatterplot(x=train_data[numerical_cols].iloc[:, i], y=train_data.SalePrice, color='purple')
    
plt.tight_layout()

3. Target variable

In [83]:
# distribution of the overall house prices
sns.distplot(train_data['SalePrice'],color='purple')
Out[83]:
<matplotlib.axes._subplots.AxesSubplot at 0x153f92cf8>
  • Sale prices range from roughly 50 to 500 thousand dollars, with most clustered around 200 thousand; the distribution is right-skewed
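Because the distribution is right-skewed, a common variation (not applied in this notebook) is to model log(SalePrice) and invert the transform after predicting:

y_log = np.log1p(train_data['SalePrice'])  # log(1 + price) compresses the right tail
sns.distplot(y_log, color='purple')
# after training on y_log, invert predictions with np.expm1(log_preds)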

Data Preprocessing

In [73]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
In [74]:
X = train_data.drop(['Id','SalePrice'],axis=1)
y = train_data.SalePrice
In [75]:
from sklearn.model_selection import train_test_split
In [76]:
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.3, random_state=101)
  • Split the columns by data type (categorical vs. numerical)
In [91]:
# select categorical columns 
categorical_cols = [cname for cname in X_train.columns if
                   X_train[cname].dtype == 'object']
# select numerical columns
numerical_cols = [cname for cname in X_train.columns if
                 X_train[cname].dtype in ['int64','float64']]
# keep selected columns only
my_cols = categorical_cols + numerical_cols
X_train = X_train[my_cols].copy()
X_valid = X_valid[my_cols].copy()
  • Create a pipeline for preprocessing
In [78]:
# impute missing values in numerical columns with the mean
numerical_transformer = SimpleImputer(strategy='mean')
# impute missing values in categorical columns with the mode, then one-hot encode
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])
# bundle preprocessing for numerical and categorical data
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)
    ])
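As a sanity check (not part of the original run), the preprocessor can be fit on the training split alone to see how many features come out of imputation plus one-hot encoding:

X_train_prepared = preprocessor.fit_transform(X_train)
X_train_prepared.shape  # (rows, numerical cols + total one-hot categories)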

Modeling

In [79]:
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error
import warnings
warnings.filterwarnings('ignore')
pd.options.display.max_columns = None
  • Random Forest
In [80]:
# define the model
model1 = RandomForestRegressor(n_estimators=1000, random_state=0)
# bundle preprocessing and model in a pipeline
clf = Pipeline(steps=[('preprocessor', preprocessor),
                      ('model', model1)
                      ])
# fit the model
clf.fit(X_train, y_train)
# get predictions
preds = clf.predict(X_valid)
# cross validation
from sklearn.model_selection import cross_val_score
scores = -1 * cross_val_score(clf, X, y, cv=5, scoring='neg_mean_absolute_error')
print(scores.mean())
17530.5413260274
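mean_absolute_error was imported earlier but never called; the holdout error on the 30% validation split could be reported alongside the CV score with one extra line:

print(mean_absolute_error(y_valid, preds))  # MAE on the held-out split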
  • XGBoost
In [89]:
# define the model
model2 = XGBRegressor(
    learning_rate=0.1,
    n_estimators=1000,
    max_depth=5,
    min_child_weight=1,
    gamma=0,
    subsample=0.8,
    colsample_bytree=0.8,
    nthread=4,
    scale_pos_weight=1,
    seed=27)
# bundle preprocessing and model in a pipeline
clf = Pipeline(steps=[('preprocessor', preprocessor),
                      ('model', model2)
                      ])
# fit the model
clf.fit(X_train, y_train)
# get predictions
preds = clf.predict(X_valid)
# cross validation (cross_val_score was imported in the previous cell)
scores = -1 * cross_val_score(clf, X, y, cv=10, scoring='neg_mean_absolute_error')
print(scores.mean())
[14:38:29] WARNING: src/objective/regression_obj.cu:152: reg:linear is now deprecated in favor of reg:squarederror.
(the same warning repeats for each subsequent model fit)
15590.204938998286
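The warning comes from this XGBoost version's default objective name reg:linear, which was renamed reg:squarederror with identical behavior; passing the objective explicitly would silence it:

# same model, explicit objective; results are unchanged
model2 = XGBRegressor(objective='reg:squarederror', learning_rate=0.1,
                      n_estimators=1000, max_depth=5, min_child_weight=1,
                      gamma=0, subsample=0.8, colsample_bytree=0.8,
                      nthread=4, scale_pos_weight=1, seed=27)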

Predict

In [96]:
test_data = test_data.drop('Id',axis=1)
In [106]:
test_pred = clf.predict(test_data)
test_pred
Out[106]:
array([122502.04 , 146333.83 , 176828.17 , ..., 173329.17 , 117575.914,
       225758.78 ], dtype=float32)
In [117]:
sp = pd.DataFrame(test_pred, columns=['SalePrice'])
In [120]:
output = pd.concat([testID,sp],axis=1)
In [121]:
output.to_csv('submission.csv', index=False)
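A final sanity check that the file matches the competition's expected format (columns Id and SalePrice, one row per test house):

submission = pd.read_csv('submission.csv')
print(submission.shape)  # expect (1459, 2)
submission.head()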