In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session
/kaggle/input/red-wine-quality-cortez-et-al-2009/winequality-red.csv
In [2]:
import pandas as pd
import sklearn
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from scipy.stats import randint
from sklearn.preprocessing import StandardScaler
import numpy as np
import matplotlib.pyplot as plt
from sklearn.svm import LinearSVC

Gaining insights from the data

Let's open the dataset with pandas so we can visualise it more easily.

In [3]:
wine_data = pd.read_csv("/kaggle/input/red-wine-quality-cortez-et-al-2009/winequality-red.csv")
wine_data.head()
Out[3]:
fixed acidity volatile acidity citric acid residual sugar chlorides free sulfur dioxide total sulfur dioxide density pH sulphates alcohol quality
0 7.4 0.70 0.00 1.9 0.076 11.0 34.0 0.9978 3.51 0.56 9.4 5
1 7.8 0.88 0.00 2.6 0.098 25.0 67.0 0.9968 3.20 0.68 9.8 5
2 7.8 0.76 0.04 2.3 0.092 15.0 54.0 0.9970 3.26 0.65 9.8 5
3 11.2 0.28 0.56 1.9 0.075 17.0 60.0 0.9980 3.16 0.58 9.8 6
4 7.4 0.70 0.00 1.9 0.076 11.0 34.0 0.9978 3.51 0.56 9.4 5

Next, let's check whether there are any problems with the dataset, such as null values or mismatched row counts.
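The describe() and info() outputs below summarise each column. As a more direct check for missing values (a small sketch, not part of the original notebook), isnull() can count them per column:

wine_data.isnull().sum()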

In [4]:
wine_data.describe()
Out[4]:
fixed acidity volatile acidity citric acid residual sugar chlorides free sulfur dioxide total sulfur dioxide density pH sulphates alcohol quality
count 1599.000000 1599.000000 1599.000000 1599.000000 1599.000000 1599.000000 1599.000000 1599.000000 1599.000000 1599.000000 1599.000000 1599.000000
mean 8.319637 0.527821 0.270976 2.538806 0.087467 15.874922 46.467792 0.996747 3.311113 0.658149 10.422983 5.636023
std 1.741096 0.179060 0.194801 1.409928 0.047065 10.460157 32.895324 0.001887 0.154386 0.169507 1.065668 0.807569
min 4.600000 0.120000 0.000000 0.900000 0.012000 1.000000 6.000000 0.990070 2.740000 0.330000 8.400000 3.000000
25% 7.100000 0.390000 0.090000 1.900000 0.070000 7.000000 22.000000 0.995600 3.210000 0.550000 9.500000 5.000000
50% 7.900000 0.520000 0.260000 2.200000 0.079000 14.000000 38.000000 0.996750 3.310000 0.620000 10.200000 6.000000
75% 9.200000 0.640000 0.420000 2.600000 0.090000 21.000000 62.000000 0.997835 3.400000 0.730000 11.100000 6.000000
max 15.900000 1.580000 1.000000 15.500000 0.611000 72.000000 289.000000 1.003690 4.010000 2.000000 14.900000 8.000000
In [5]:
wine_data.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1599 entries, 0 to 1598
Data columns (total 12 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   fixed acidity         1599 non-null   float64
 1   volatile acidity      1599 non-null   float64
 2   citric acid           1599 non-null   float64
 3   residual sugar        1599 non-null   float64
 4   chlorides             1599 non-null   float64
 5   free sulfur dioxide   1599 non-null   float64
 6   total sulfur dioxide  1599 non-null   float64
 7   density               1599 non-null   float64
 8   pH                    1599 non-null   float64
 9   sulphates             1599 non-null   float64
 10  alcohol               1599 non-null   float64
 11  quality               1599 non-null   int64  
dtypes: float64(11), int64(1)
memory usage: 150.0 KB

There are no null values and every column has the same number of rows, so no cleaning is needed.

Let's see how well each feature correlates with quality:

In [6]:
corr_matrix = wine_data.corr()
corr_matrix["quality"].sort_values(ascending=False)
Out[6]:
quality                 1.000000
alcohol                 0.476166
sulphates               0.251397
citric acid             0.226373
fixed acidity           0.124052
residual sugar          0.013732
free sulfur dioxide    -0.050656
pH                     -0.057731
chlorides              -0.128907
density                -0.174919
total sulfur dioxide   -0.185100
volatile acidity       -0.390558
Name: quality, dtype: float64
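Alcohol has the strongest positive correlation with quality, and volatile acidity the strongest negative one. As a quick sanity check (a sketch, not in the original notebook), that relationship can be visualised with a scatter plot:

wine_data.plot(kind="scatter", x="alcohol", y="quality", alpha=0.1)
plt.show()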

We will plot a histogram of the quality column to visualise the class imbalance across quality levels:

In [7]:
wine_data["quality"].hist()
Out[7]:
<matplotlib.axes._subplots.AxesSubplot at 0x7fec6db83310>
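As a complement to the histogram, the exact number of wines at each quality level can be listed; a small sketch:

wine_data["quality"].value_counts().sort_index()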

Because of this class imbalance, a purely random split could leave the rare quality levels under- or over-represented in the test set, which would bias training and evaluation. To keep both sets representative of the full dataset, we will use stratified sampling. Let's start preparing the data.

Preparing the data

In [8]:
wine_data_array = np.array(wine_data)
X = wine_data_array[:, :-1]   # the 11 physicochemical features
Y = wine_data_array[:, -1:]   # the quality labels
Y = Y.ravel()                 # flatten to a 1-D array
X, Y
Out[8]:
(array([[ 7.4  ,  0.7  ,  0.   , ...,  3.51 ,  0.56 ,  9.4  ],
        [ 7.8  ,  0.88 ,  0.   , ...,  3.2  ,  0.68 ,  9.8  ],
        [ 7.8  ,  0.76 ,  0.04 , ...,  3.26 ,  0.65 ,  9.8  ],
        ...,
        [ 6.3  ,  0.51 ,  0.13 , ...,  3.42 ,  0.75 , 11.   ],
        [ 5.9  ,  0.645,  0.12 , ...,  3.57 ,  0.71 , 10.2  ],
        [ 6.   ,  0.31 ,  0.47 , ...,  3.39 ,  0.66 , 11.   ]]),
 array([5., 5., 5., ..., 6., 5., 6.]))
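Note that converting the whole DataFrame to a single NumPy array upcasts the integer quality labels to floats, which is why the classes appear as 3.0, 4.0, ... in the reports below. A minimal alternative that keeps the labels as integers, assuming we index the DataFrame directly, would be:

X = wine_data.drop(columns="quality").values
Y = wine_data["quality"].values   # dtype stays int64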
In [9]:
#Splitting the data into train and test sets
from sklearn.model_selection import StratifiedShuffleSplit
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
for train_index, test_index in split.split(X, Y):
  X_train, X_test = X[train_index], X[test_index]
  y_train, y_test = Y[train_index], Y[test_index]
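As an aside, a single stratified split like this can also be obtained with train_test_split and its stratify argument; the exact indices may differ from the split above, so this is only a sketch of an equivalent approach:

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.2, stratify=Y, random_state=42)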
In [10]:
#Confirm that the sets are representative of population
pd.DataFrame(y_train, columns=["quality"]).hist()
Out[10]:
array([[<matplotlib.axes._subplots.AxesSubplot object at 0x7fec6daef8d0>]],
      dtype=object)
In [11]:
#Standardizing the features
standard_scalar = StandardScaler()
X_train_scaled = standard_scalar.fit_transform(X_train)
X_test_scaled = standard_scalar.transform(X_test)
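Note that the scaler is fitted on the training set only and then applied to the test set, which avoids leaking test-set statistics into training. An alternative sketch (not used below) is to wrap the scaler and the classifier in a Pipeline, so the same rule is applied automatically during cross-validation:

from sklearn.pipeline import make_pipeline
knn_pipeline = make_pipeline(StandardScaler(), KNeighborsClassifier())
knn_pipeline.fit(X_train, y_train)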

Using KNeighborsClassifier

Now we can train a model using KNeighborsClassifier. We will use RandomizedSearchCV to fine-tune the hyperparameters.

In [12]:
param_distribs = {
    'n_neighbors': randint(low=1, high=200),
    'weights': ['uniform', 'distance'],
}
knn_clf = KNeighborsClassifier()
rnd_search_cv = RandomizedSearchCV(knn_clf, param_distributions=param_distribs, n_iter=100, random_state=42, verbose=0)
rnd_search_cv.fit(X_train_scaled, y_train)
Out[12]:
RandomizedSearchCV(estimator=KNeighborsClassifier(), n_iter=100,
                   param_distributions={'n_neighbors': <scipy.stats._distn_infrastructure.rv_frozen object at 0x7fec6da09a10>,
                                        'weights': ['uniform', 'distance']},
                   random_state=42)
In [13]:
best_knn_model = rnd_search_cv.best_estimator_
best_knn_model
Out[13]:
KNeighborsClassifier(n_neighbors=96, weights='distance')
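The mean cross-validated accuracy of this chosen model can also be inspected (not shown in the original run):

rnd_search_cv.best_score_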

Since the dataset is skewed, accuracy alone is not a good performance measure. We will evaluate performance using precision (the fraction of predictions for a class that are correct) and recall (the fraction of true members of a class that are found).

In [14]:
from sklearn.metrics import classification_report, plot_confusion_matrix

y_pred = best_knn_model.predict(X_test_scaled)
print(classification_report(y_test, y_pred))
              precision    recall  f1-score   support

         3.0       0.00      0.00      0.00         2
         4.0       0.00      0.00      0.00        11
         5.0       0.74      0.76      0.75       136
         6.0       0.62      0.77      0.69       128
         7.0       0.76      0.40      0.52        40
         8.0       1.00      0.33      0.50         3

    accuracy                           0.68       320
   macro avg       0.52      0.38      0.41       320
weighted avg       0.67      0.68      0.66       320

/opt/conda/lib/python3.7/site-packages/sklearn/metrics/_classification.py:1221: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, msg_start, len(result))

The weighted averages show a precision of 67% and a recall of 68%, but the macro averages (52% precision, 38% recall) are much lower because the model never predicts the rare quality levels 3 and 4.
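If you want the weighted and macro averages separately from the full report, they can be computed directly; a small sketch (the zero_division argument silences the warning above):

from sklearn.metrics import precision_score, recall_score
print(precision_score(y_test, y_pred, average="weighted", zero_division=0),
      recall_score(y_test, y_pred, average="weighted"))
print(precision_score(y_test, y_pred, average="macro", zero_division=0),
      recall_score(y_test, y_pred, average="macro"))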

Using a confusion matrix, we can visualise how well our model predicts the wine quality:

In [15]:
fig, ax = plt.subplots(figsize=(20, 10))
plot_confusion_matrix(best_knn_model, X_test_scaled, y_test, cmap=plt.cm.Blues, ax=ax)
plt.show()

Using LinearSVC

In [16]:
from sklearn.svm import LinearSVC
In [17]:
param_distribs = {
    "C": np.arange(1,3,.1)
}
lin_svc = LinearSVC(dual=False)
rnd_search_cv = RandomizedSearchCV(lin_svc, param_distributions=param_distribs, random_state=42, verbose=0)
rnd_search_cv.fit(X_train_scaled, y_train)
Out[17]:
RandomizedSearchCV(estimator=LinearSVC(dual=False),
                   param_distributions={'C': array([1. , 1.1, 1.2, 1.3, 1.4, 1.5, 1.6, 1.7, 1.8, 1.9, 2. , 2.1, 2.2,
       2.3, 2.4, 2.5, 2.6, 2.7, 2.8, 2.9])},
                   random_state=42)
In [18]:
best_svc_model = rnd_search_cv.best_estimator_
best_svc_model
Out[18]:
LinearSVC(C=2.7000000000000015, dual=False)
In [19]:
rnd_search_cv.best_score_
Out[19]:
0.580125612745098
In [20]:
y_pred = best_svc_model.predict(X_test_scaled)
print(classification_report(y_test, y_pred))
              precision    recall  f1-score   support

         3.0       0.00      0.00      0.00         2
         4.0       0.00      0.00      0.00        11
         5.0       0.62      0.77      0.69       136
         6.0       0.51      0.59      0.55       128
         7.0       0.00      0.00      0.00        40
         8.0       0.00      0.00      0.00         3

    accuracy                           0.57       320
   macro avg       0.19      0.23      0.21       320
weighted avg       0.47      0.57      0.51       320

/opt/conda/lib/python3.7/site-packages/sklearn/metrics/_classification.py:1221: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, msg_start, len(result))

LinearSVC did not perform as well as the KNN classifier: it has lower precision and recall, and it never predicts quality levels 3, 4, 7, or 8.
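One possible follow-up for the class imbalance (a sketch only, not run in this notebook) is to reweight the classes with class_weight="balanced", which penalises mistakes on the rare quality levels more heavily:

lin_svc_balanced = LinearSVC(dual=False, class_weight="balanced")
lin_svc_balanced.fit(X_train_scaled, y_train)
print(classification_report(y_test, lin_svc_balanced.predict(X_test_scaled), zero_division=0))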

In [21]:
fig, ax = plt.subplots(figsize=(20, 10))
plot_confusion_matrix(best_svc_model, X_test_scaled, y_test, cmap=plt.cm.Blues, ax=ax)
plt.show()