Boston Housing – Data Science

Latihan Data Science untuk studi kasus Perumahan (Housing) di kota Boston dengan menggunakan Jupyter. 

In [1]:
import numpy as np
import pandas as pd
import os 

import seaborn as sns
import matplotlib.pyplot as plt
from scipy import stats
In [2]:
#df --> create dataframe pandas
df = pd.read_csv("/home/abdusy/Documents/Data_Science/ML_AI/HousingData.csv")



In [3]:
#Dimension of the dataset
print(np.shape(df))
(506, 14)
In [9]:
# Let's summarize the data to see the distribution of data
print(df.describe())
             CRIM          ZN       INDUS        CHAS         NOX          RM  \
count  486.000000  486.000000  486.000000  486.000000  506.000000  506.000000   
mean     3.611874   11.211934   11.083992    0.069959    0.554695    6.284634   
std      8.720192   23.388876    6.835896    0.255340    0.115878    0.702617   
min      0.006320    0.000000    0.460000    0.000000    0.385000    3.561000   
25%      0.081900    0.000000    5.190000    0.000000    0.449000    5.885500   
50%      0.253715    0.000000    9.690000    0.000000    0.538000    6.208500   
75%      3.560262   12.500000   18.100000    0.000000    0.624000    6.623500   
max     88.976200  100.000000   27.740000    1.000000    0.871000    8.780000   

              AGE         DIS         RAD         TAX     PTRATIO           B  \
count  486.000000  506.000000  506.000000  506.000000  506.000000  506.000000   
mean    68.518519    3.795043    9.549407  408.237154   18.455534  356.674032   
std     27.999513    2.105710    8.707259  168.537116    2.164946   91.294864   
min      2.900000    1.129600    1.000000  187.000000   12.600000    0.320000   
25%     45.175000    2.100175    4.000000  279.000000   17.400000  375.377500   
50%     76.800000    3.207450    5.000000  330.000000   19.050000  391.440000   
75%     93.975000    5.188425   24.000000  666.000000   20.200000  396.225000   
max    100.000000   12.126500   24.000000  711.000000   22.000000  396.900000   

            LSTAT        MEDV  
count  486.000000  506.000000  
mean    12.715432   22.532806  
std      7.155871    9.197104  
min      1.730000    5.000000  
25%      7.125000   17.025000  
50%     11.430000   21.200000  
75%     16.955000   25.000000  
max     37.970000   50.000000  
In [8]:
import seaborn as sns
import matplotlib.pyplot as plt
from scipy import stats

fig, axs = plt.subplots(ncols=7, nrows=2, figsize=(20, 10))
index = 0
axs = axs.flatten()
for k,v in df.items():
    sns.boxplot(y=k, df=df, ax=axs[index])
    index += 1
plt.tight_layout(pad=0.4, w_pad=0.5, h_pad=5.0)
---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-8-44e62316f35b> in <module>
      7 axs = axs.flatten()
      8 for k,v in df.items():
----> 9     sns.boxplot(y=k, df=df, ax=axs[index])
     10     index += 1
     11 plt.tight_layout(pad=0.4, w_pad=0.5, h_pad=5.0)

~/anaconda3/lib/python3.7/site-packages/seaborn/categorical.py in boxplot(x, y, hue, data, order, hue_order, orient, color, palette, saturation, width, dodge, fliersize, linewidth, whis, notch, ax, **kwargs)
   2229     plotter = _BoxPlotter(x, y, hue, data, order, hue_order,
   2230                           orient, color, palette, saturation,
-> 2231                           width, dodge, fliersize, linewidth)
   2232 
   2233     if ax is None:

~/anaconda3/lib/python3.7/site-packages/seaborn/categorical.py in __init__(self, x, y, hue, data, order, hue_order, orient, color, palette, saturation, width, dodge, fliersize, linewidth)
    444                  width, dodge, fliersize, linewidth):
    445 
--> 446         self.establish_variables(x, y, hue, data, orient, order, hue_order)
    447         self.establish_colors(color, palette, saturation)
    448 

~/anaconda3/lib/python3.7/site-packages/seaborn/categorical.py in establish_variables(self, x, y, hue, data, orient, order, hue_order, units)
    153                 if isinstance(input, string_types):
    154                     err = "Could not interpret input '{}'".format(input)
--> 155                     raise ValueError(err)
    156 
    157             # Figure out the plotting orientation

ValueError: Could not interpret input 'CRIM'
In [5]:
#delete frame in brutal way. No confirmation
df.dropna()
Out[5]:
.dataframe tbody tr th:only-of-type {
vertical-align: middle;
}.dataframe tbody tr th {
vertical-align: top;
}

.dataframe thead th {
text-align: right;
}

CRIM ZN INDUS CHAS NOX RM AGE DIS RAD TAX PTRATIO B LSTAT MEDV
0 0.00632 18.0 2.31 0.0 0.538 6.575 65.2 4.0900 1 296 15.3 396.90 4.98 24.0
1 0.02731 0.0 7.07 0.0 0.469 6.421 78.9 4.9671 2 242 17.8 396.90 9.14 21.6
2 0.02729 0.0 7.07 0.0 0.469 7.185 61.1 4.9671 2 242 17.8 392.83 4.03 34.7
3 0.03237 0.0 2.18 0.0 0.458 6.998 45.8 6.0622 3 222 18.7 394.63 2.94 33.4
5 0.02985 0.0 2.18 0.0 0.458 6.430 58.7 6.0622 3 222 18.7 394.12 5.21 28.7
499 0.17783 0.0 9.69 0.0 0.585 5.569 73.5 2.3999 6 391 19.2 395.77 15.10 17.5
500 0.22438 0.0 9.69 0.0 0.585 6.027 79.7 2.4982 6 391 19.2 396.90 14.33 16.8
502 0.04527 0.0 11.93 0.0 0.573 6.120 76.7 2.2875 1 273 21.0 396.90 9.08 20.6
503 0.06076 0.0 11.93 0.0 0.573 6.976 91.0 2.1675 1 273 21.0 396.90 5.64 23.9
504 0.10959 0.0 11.93 0.0 0.573 6.794 89.3 2.3889 1 273 21.0 393.45 6.48 22.0

394 rows × 14 columns

In [36]:
#pour avoir la shape de donnee
df.shape
Out[36]:
(394, 14)
In [24]:
df2 = df.dropna()
print(df.shape)
print(df2.shape)
(506, 14)
(394, 14)
In [38]:
df.dropna(inplace=True)
print(df.shape)
(394, 14)
In [29]:
df.plot(figsize=(10,10))
Out[29]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f279a02b210>
In [32]:
df["ZN"].plot(figsize=(10,10))
Out[32]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f2799eb0890>
In [39]:
df["ZN"].hist(bins=100)
Out[39]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f2799de9f10>
In [40]:
df.loc[1, "RM"]
Out[40]:
6.421
In [41]:
df.loc[2, "TAX"]
Out[41]:
242
In [42]:
#Je veux une seule valeur celle de la ligne 2 colloone "TAX"
df.iloc[2, 9]
Out[42]:
242
In [43]:
#Je veux toute la ligne 2
df.iloc[2, :]
Out[43]:
CRIM         0.02729
ZN           0.00000
INDUS        7.07000
CHAS         0.00000
NOX          0.46900
RM           7.18500
AGE         61.10000
DIS          4.96710
RAD          2.00000
TAX        242.00000
PTRATIO     17.80000
B          392.83000
LSTAT        4.03000
MEDV        34.70000
Name: 2, dtype: float64
In [44]:
#
df["TAX"]
Out[44]:
0      296
1      242
2      242
3      222
5      222
      ... 
499    391
500    391
502    273
503    273
504    273
Name: TAX, Length: 394, dtype: int64
In [45]:
df.sum()
Out[45]:
CRIM         1453.91365
ZN           4515.50000
INDUS        4334.34000
CHAS           27.00000
NOX           217.96690
RM           2474.32600
AGE         27159.50000
DIS          1499.27560
RAD          3705.00000
TAX        160134.00000
PTRATIO      7303.80000
B          141245.43000
LSTAT        5031.03000
MEDV         8809.70000
dtype: float64
In [46]:
df[ ["CRIM", "B"] ]
Out[46]:
.dataframe tbody tr th:only-of-type {
vertical-align: middle;
}.dataframe tbody tr th {
vertical-align: top;
}

.dataframe thead th {
text-align: right;
}

CRIM B
0 0.00632 396.90
1 0.02731 396.90
2 0.02729 392.83
3 0.03237 394.63
5 0.02985 394.12
499 0.17783 395.77
500 0.22438 396.90
502 0.04527 396.90
503 0.06076 396.90
504 0.10959 393.45

394 rows × 2 columns

In [48]:
df[ ["CRIM", "B"] ].sum()
Out[48]:
CRIM      1453.91365
B       141245.43000
dtype: float64
In [49]:
np.sum(df[ ["CRIM", "B"]])
Out[49]:
CRIM      1453.91365
B       141245.43000
dtype: float64
In [47]:
#To see the statistics
df.describe()
Out[47]:
.dataframe tbody tr th:only-of-type {
vertical-align: middle;
}.dataframe tbody tr th {
vertical-align: top;
}

.dataframe thead th {
text-align: right;
}

CRIM ZN INDUS CHAS NOX RM AGE DIS RAD TAX PTRATIO B LSTAT
count 486.000000 486.000000 486.000000 486.000000 506.000000 506.000000 486.000000 506.000000 506.000000 506.000000 506.000000 506.000000 486.000000
mean 3.611874 11.211934 11.083992 0.069959 0.554695 6.284634 68.518519 3.795043 9.549407 408.237154 18.455534 356.674032 12.715432
std 8.720192 23.388876 6.835896 0.255340 0.115878 0.702617 27.999513 2.105710 8.707259 168.537116 2.164946 91.294864 7.155871
min 0.006320 0.000000 0.460000 0.000000 0.385000 3.561000 2.900000 1.129600 1.000000 187.000000 12.600000 0.320000 1.730000
25% 0.081900 0.000000 5.190000 0.000000 0.449000 5.885500 45.175000 2.100175 4.000000 279.000000 17.400000 375.377500 7.125000
50% 0.253715 0.000000 9.690000 0.000000 0.538000 6.208500 76.800000 3.207450 5.000000 330.000000 19.050000 391.440000 11.430000
75% 3.560262 12.500000 18.100000 0.000000 0.624000 6.623500 93.975000 5.188425 24.000000 666.000000 20.200000 396.225000 16.955000
max 88.976200 100.000000 27.740000 1.000000 0.871000 8.780000 100.000000 12.126500 24.000000 711.000000 22.000000 396.900000 37.970000
In [54]:
df.describe().plot()
Out[54]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f2798a44610>
In [11]:
~df.isna()
Out[11]:
.dataframe tbody tr th:only-of-type {
vertical-align: middle;
}.dataframe tbody tr th {
vertical-align: top;
}

.dataframe thead th {
text-align: right;
}

CRIM ZN INDUS CHAS NOX RM AGE DIS RAD TAX PTRATIO B LSTAT MEDV
0 True True True True True True True True True True True True True True
1 True True True True True True True True True True True True True True
2 True True True True True True True True True True True True True True
3 True True True True True True True True True True True True True True
4 True True True True True True True True True True True True False True
501 True True True True True True True True True True True True False True
502 True True True True True True True True True True True True True True
503 True True True True True True True True True True True True True True
504 True True True True True True True True True True True True True True
505 True True True True True True False True True True True True True True

506 rows × 14 columns

In [13]:
#taken from : https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.train_test_split.html
from sklearn.model_selection import train_test_split
In [15]:
print(train_test_split.__doc__)
Split arrays or matrices into random train and test subsets

    Quick utility that wraps input validation and
    ``next(ShuffleSplit().split(X, y))`` and application to input data
    into a single call for splitting (and optionally subsampling) data in a
    oneliner.

    Read more in the :ref:`User Guide <cross_validation>`.

    Parameters
    ----------
    *arrays : sequence of indexables with same length / shape[0]
        Allowed inputs are lists, numpy arrays, scipy-sparse
        matrices or pandas dataframes.

    test_size : float, int or None, optional (default=None)
        If float, should be between 0.0 and 1.0 and represent the proportion
        of the dataset to include in the test split. If int, represents the
        absolute number of test samples. If None, the value is set to the
        complement of the train size. If ``train_size`` is also None, it will
        be set to 0.25.

    train_size : float, int, or None, (default=None)
        If float, should be between 0.0 and 1.0 and represent the
        proportion of the dataset to include in the train split. If
        int, represents the absolute number of train samples. If None,
        the value is automatically set to the complement of the test size.

    random_state : int, RandomState instance or None, optional (default=None)
        If int, random_state is the seed used by the random number generator;
        If RandomState instance, random_state is the random number generator;
        If None, the random number generator is the RandomState instance used
        by `np.random`.

    shuffle : boolean, optional (default=True)
        Whether or not to shuffle the data before splitting. If shuffle=False
        then stratify must be None.

    stratify : array-like or None (default=None)
        If not None, data is split in a stratified fashion, using this as
        the class labels.

    Returns
    -------
    splitting : list, length=2 * len(arrays)
        List containing train-test split of inputs.

        .. versionadded:: 0.16
            If the input is sparse, the output will be a
            ``scipy.sparse.csr_matrix``. Else, output type is the same as the
            input type.

    Examples
    --------
    >>> import numpy as np
    >>> from sklearn.model_selection import train_test_split
    >>> X, y = np.arange(10).reshape((5, 2)), range(5)
    >>> X
    array([[0, 1],
           [2, 3],
           [4, 5],
           [6, 7],
           [8, 9]])
    >>> list(y)
    [0, 1, 2, 3, 4]

    >>> X_train, X_test, y_train, y_test = train_test_split(
    ...     X, y, test_size=0.33, random_state=42)
    ...
    >>> X_train
    array([[4, 5],
           [0, 1],
           [6, 7]])
    >>> y_train
    [2, 0, 3]
    >>> X_test
    array([[2, 3],
           [8, 9]])
    >>> y_test
    [1, 4]

    >>> train_test_split(y, shuffle=False)
    [[0, 1, 2], [3, 4]]

    
In [50]:
#show all columns name
df.columns
Out[50]:
Index(['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD', 'TAX',
       'PTRATIO', 'B', 'LSTAT', 'MEDV'],
      dtype='object')
In [31]:
df.head()
Out[31]:
.dataframe tbody tr th:only-of-type {
vertical-align: middle;
}.dataframe tbody tr th {
vertical-align: top;
}

.dataframe thead th {
text-align: right;
}

CRIM ZN INDUS CHAS NOX RM AGE DIS RAD TAX PTRATIO B LSTAT MEDV
0 0.00632 18.0 2.31 0.0 0.538 6.575 65.2 4.0900 1 296 15.3 396.90 4.98 24.0
1 0.02731 0.0 7.07 0.0 0.469 6.421 78.9 4.9671 2 242 17.8 396.90 9.14 21.6
2 0.02729 0.0 7.07 0.0 0.469 7.185 61.1 4.9671 2 242 17.8 392.83 4.03 34.7
3 0.03237 0.0 2.18 0.0 0.458 6.998 45.8 6.0622 3 222 18.7 394.63 2.94 33.4
4 0.06905 0.0 2.18 0.0 0.458 7.147 54.2 6.0622 3 222 18.7 396.90 NaN 36.2
In [54]:
#Slice dataframe (df) by moving column price (MEDV) to a new dataframe
dfy = df["MEDV"]
dfy.head()
Out[54]:
0    24.0
1    21.6
2    34.7
3    33.4
4    36.2
Name: MEDV, dtype: float64
In [55]:
#slice the rest of column into dfx 
dfx = df.copy()
dfx.head()
Out[55]:
.dataframe tbody tr th:only-of-type {
vertical-align: middle;
}.dataframe tbody tr th {
vertical-align: top;
}

.dataframe thead th {
text-align: right;
}

CRIM ZN INDUS CHAS NOX RM AGE DIS RAD TAX PTRATIO B LSTAT MEDV
0 0.00632 18.0 2.31 0.0 0.538 6.575 65.2 4.0900 1 296 15.3 396.90 4.98 24.0
1 0.02731 0.0 7.07 0.0 0.469 6.421 78.9 4.9671 2 242 17.8 396.90 9.14 21.6
2 0.02729 0.0 7.07 0.0 0.469 7.185 61.1 4.9671 2 242 17.8 392.83 4.03 34.7
3 0.03237 0.0 2.18 0.0 0.458 6.998 45.8 6.0622 3 222 18.7 394.63 2.94 33.4
4 0.06905 0.0 2.18 0.0 0.458 7.147 54.2 6.0622 3 222 18.7 396.90 NaN 36.2
In [40]:
X_train, X_test, y_train, y_test = train_test_split[df.iloc[:,:-1], df["MEDV"], train_size=.7]
  File "<ipython-input-40-e365dc93ba13>", line 1
    X_train, X_test, y_train, y_test = train_test_split[df.iloc[:,:-1], df["MEDV"], train_size=.7]
                                                                                              ^
SyntaxError: invalid syntax
In [6]:
from sklearn.linear_model import LinearRegression

reg = LinearRegression() #regle
reg.fit(X_train, y_train) #learning process

reg.predict(X_test) #Result of prediction
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-6-f788e936d6f5> in <module>
      2 
      3 reg = LinearRegression() #regle
----> 4 reg.fit(X_train, y_train) #learning process
      5 
      6 reg.predict(X_test) #Result of prediction

NameError: name 'X_train' is not defined
In [ ]:
 

 

 

Tags: , , , ,

Leave a Reply

Fill in your details below or click an icon to log in:

WordPress.com Logo

You are commenting using your WordPress.com account. Log Out /  Change )

Google photo

You are commenting using your Google account. Log Out /  Change )

Twitter picture

You are commenting using your Twitter account. Log Out /  Change )

Facebook photo

You are commenting using your Facebook account. Log Out /  Change )

Connecting to %s


%d bloggers like this: