In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session
/kaggle/input/red-wine-quality-cortez-et-al-2009/winequality-red.csv
In [2]:
import pandas as pd
import sklearn
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from scipy.stats import randint
from sklearn.preprocessing import StandardScaler
import numpy as np
import matplotlib.pyplot as plt
from sklearn.svm import LinearSVC

Gaining insights from the data

Let's open the dataset with pandas so we can visualise it more easily.

In [3]:
wine_data = pd.read_csv("/kaggle/input/red-wine-quality-cortez-et-al-2009/winequality-red.csv")
wine_data.head()
Out[3]:
fixed acidity volatile acidity citric acid residual sugar chlorides free sulfur dioxide total sulfur dioxide density pH sulphates alcohol quality
0 7.4 0.70 0.00 1.9 0.076 11.0 34.0 0.9978 3.51 0.56 9.4 5
1 7.8 0.88 0.00 2.6 0.098 25.0 67.0 0.9968 3.20 0.68 9.8 5
2 7.8 0.76 0.04 2.3 0.092 15.0 54.0 0.9970 3.26 0.65 9.8 5
3 11.2 0.28 0.56 1.9 0.075 17.0 60.0 0.9980 3.16 0.58 9.8 6
4 7.4 0.70 0.00 1.9 0.076 11.0 34.0 0.9978 3.51 0.56 9.4 5

Next, let's check whether there are any problems with the dataset, such as null values or mismatched row counts.
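The describe() and info() outputs below summarise each column. As a more direct check for missing values (a small sketch, not part of the original notebook), isnull() can count them per column:

wine_data.isnull().sum()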

In [4]:
wine_data.describe()
Out[4]:
fixed acidity volatile acidity citric acid residual sugar chlorides free sulfur dioxide total sulfur dioxide density pH sulphates alcohol quality
count 1599.000000 1599.000000 1599.000000 1599.000000 1599.000000 1599.000000 1599.000000 1599.000000 1599.000000 1599.000000 1599.000000 1599.000000
mean 8.319637 0.527821 0.270976 2.538806 0.087467 15.874922 46.467792 0.996747 3.311113 0.658149 10.422983 5.636023
std 1.741096 0.179060 0.194801 1.409928 0.047065 10.460157 32.895324 0.001887 0.154386 0.169507 1.065668 0.807569
min 4.600000 0.120000 0.000000 0.900000 0.012000 1.000000 6.000000 0.990070 2.740000 0.330000 8.400000 3.000000
25% 7.100000 0.390000 0.090000 1.900000 0.070000 7.000000 22.000000 0.995600 3.210000 0.550000 9.500000 5.000000
50% 7.900000 0.520000 0.260000 2.200000 0.079000 14.000000 38.000000 0.996750 3.310000 0.620000 10.200000 6.000000
75% 9.200000 0.640000 0.420000 2.600000 0.090000 21.000000 62.000000 0.997835 3.400000 0.730000 11.100000 6.000000
max 15.900000 1.580000 1.000000 15.500000 0.611000 72.000000 289.000000 1.003690 4.010000 2.000000 14.900000 8.000000
In [5]:
wine_data.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1599 entries, 0 to 1598
Data columns (total 12 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   fixed acidity         1599 non-null   float64
 1   volatile acidity      1599 non-null   float64
 2   citric acid           1599 non-null   float64
 3   residual sugar        1599 non-null   float64
 4   chlorides             1599 non-null   float64
 5   free sulfur dioxide   1599 non-null   float64
 6   total sulfur dioxide  1599 non-null   float64
 7   density               1599 non-null   float64
 8   pH                    1599 non-null   float64
 9   sulphates             1599 non-null   float64
 10  alcohol               1599 non-null   float64
 11  quality               1599 non-null   int64  
dtypes: float64(11), int64(1)
memory usage: 150.0 KB

There are no null values and every column has the same number of rows, so no cleaning is needed.

Let's see how well each feature correlates with quality:

In [6]:
corr_matrix = wine_data.corr()
corr_matrix["quality"].sort_values(ascending=False)
Out[6]:
quality                 1.000000
alcohol                 0.476166
sulphates               0.251397
citric acid             0.226373
fixed acidity           0.124052
residual sugar          0.013732
free sulfur dioxide    -0.050656
pH                     -0.057731
chlorides              -0.128907
density                -0.174919
total sulfur dioxide   -0.185100
volatile acidity       -0.390558
Name: quality, dtype: float64
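Alcohol has the strongest positive correlation with quality, and volatile acidity the strongest negative one. As a quick sanity check (a sketch, not in the original notebook), that relationship can be visualised with a scatter plot:

wine_data.plot(kind="scatter", x="alcohol", y="quality", alpha=0.1)
plt.show()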

We will plot a histogram of the quality column to visualise the class imbalance across quality levels:

In [7]:
wine_data["quality"].hist()
Out[7]:
<matplotlib.axes._subplots.AxesSubplot at 0x7fec6db83310>
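As a complement to the histogram, the exact number of wines at each quality level can be listed; a small sketch:

wine_data["quality"].value_counts().sort_index()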

Because of this class imbalance, a purely random split could leave the rare quality levels under- or over-represented in the test set, which would bias training and evaluation. To keep both sets representative of the full dataset, we will use stratified sampling. Let's start preparing the data.

Preparing the data

In [8]:
wine_data_array = np.array(wine_data)
X = wine_data_array[:, :-1]   # the 11 physicochemical features
Y = wine_data_array[:, -1:]   # the quality labels
Y = Y.ravel()                 # flatten to a 1-D array
X, Y
Out[8]:
(array([[ 7.4  ,  0.7  ,  0.   , ...,  3.51 ,  0.56 ,  9.4  ],
        [ 7.8  ,  0.88 ,  0.   , ...,  3.2  ,  0.68 ,  9.8  ],
        [ 7.8  ,  0.76 ,  0.04 , ...,  3.26 ,  0.65 ,  9.8  ],
        ...,
        [ 6.3  ,  0.51 ,  0.13 , ...,  3.42 ,  0.75 , 11.   ],
        [ 5.9  ,  0.645,  0.12 , ...,  3.57 ,  0.71 , 10.2  ],
        [ 6.   ,  0.31 ,  0.47 , ...,  3.39 ,  0.66 , 11.   ]]),
 array([5., 5., 5., ..., 6., 5., 6.]))
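Note that converting the whole DataFrame to a single NumPy array upcasts the integer quality labels to floats, which is why the classes appear as 3.0, 4.0, ... in the reports below. A minimal alternative that keeps the labels as integers, assuming we index the DataFrame directly, would be:

X = wine_data.drop(columns="quality").values
Y = wine_data["quality"].values   # dtype stays int64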
In [9]:
#Splitting the data into train and test sets
from sklearn.model_selection import StratifiedShuffleSplit
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
for train_index, test_index in split.split(X, Y):
  X_train, X_test = X[train_index], X[test_index]
  y_train, y_test = Y[train_index], Y[test_index]
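As an aside, a single stratified split like this can also be obtained with train_test_split and its stratify argument; the exact indices may differ from the split above, so this is only a sketch of an equivalent approach:

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.2, stratify=Y, random_state=42)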
In [10]:
#Confirm that the sets are representative of population
pd.DataFrame(y_train, columns=["quality"]).hist()
Out[10]:
array([[<matplotlib.axes._subplots.AxesSubplot object at 0x7fec6daef8d0>]],
      dtype=object)
In [11]:
#Standardizing the features
standard_scalar = StandardScaler()
X_train_scaled = standard_scalar.fit_transform(X_train)
X_test_scaled = standard_scalar.transform(X_test)
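Note that the scaler is fitted on the training set only and then applied to the test set, which avoids leaking test-set statistics into training. An alternative sketch (not used below) is to wrap the scaler and the classifier in a Pipeline, so the same rule is applied automatically during cross-validation:

from sklearn.pipeline import make_pipeline
knn_pipeline = make_pipeline(StandardScaler(), KNeighborsClassifier())
knn_pipeline.fit(X_train, y_train)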

Using KNeighborsClassifier

Now we can train a model using KNeighborsClassifier. We will use RandomizedSearchCV to fine-tune the hyperparameters.

In [12]:
param_distribs = {
    'n_neighbors': randint(low=1, high=200),
    'weights': ['uniform', 'distance'],
}
knn_clf = KNeighborsClassifier()
rnd_search_cv = RandomizedSearchCV(knn_clf, param_distributions=param_distribs, n_iter=100, random_state=42, verbose=0)
rnd_search_cv.fit(X_train_scaled, y_train)
Out[12]:
RandomizedSearchCV(estimator=KNeighborsClassifier(), n_iter=100,
                   param_distributions={'n_neighbors': <scipy.stats._distn_infrastructure.rv_frozen object at 0x7fec6da09a10>,
                                        'weights': ['uniform', 'distance']},
                   random_state=42)
In [13]:
best_knn_model = rnd_search_cv.best_estimator_
best_knn_model
Out[13]:
KNeighborsClassifier(n_neighbors=96, weights='distance')
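The mean cross-validated accuracy of this chosen model can also be inspected (not shown in the original run):

rnd_search_cv.best_score_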

Since the dataset is skewed, accuracy alone is not a good performance measure. We will evaluate performance using precision (the fraction of predictions for a class that are correct) and recall (the fraction of true members of a class that are found).

In [14]:
from sklearn.metrics import classification_report, plot_confusion_matrix

y_pred = best_knn_model.predict(X_test_scaled)
print(classification_report(y_test, y_pred))
              precision    recall  f1-score   support

         3.0       0.00      0.00      0.00         2
         4.0       0.00      0.00      0.00        11
         5.0       0.74      0.76      0.75       136
         6.0       0.62      0.77      0.69       128
         7.0       0.76      0.40      0.52        40
         8.0       1.00      0.33      0.50         3

    accuracy                           0.68       320
   macro avg       0.52      0.38      0.41       320
weighted avg       0.67      0.68      0.66       320

/opt/conda/lib/python3.7/site-packages/sklearn/metrics/_classification.py:1221: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, msg_start, len(result))

The weighted averages show a precision of 67% and a recall of 68%, but the macro averages (52% precision, 38% recall) are much lower because the model never predicts the rare quality levels 3 and 4.
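If you want the weighted and macro averages separately from the full report, they can be computed directly; a small sketch (the zero_division argument silences the warning above):

from sklearn.metrics import precision_score, recall_score
print(precision_score(y_test, y_pred, average="weighted", zero_division=0),
      recall_score(y_test, y_pred, average="weighted"))
print(precision_score(y_test, y_pred, average="macro", zero_division=0),
      recall_score(y_test, y_pred, average="macro"))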

Using a confusion matrix, we can visualise how well our model predicts the wine quality:

In [15]:
fig, ax = plt.subplots(figsize=(20, 10))
plot_confusion_matrix(best_knn_model, X_test_scaled, y_test, cmap=plt.cm.Blues, ax=ax)
plt.show()

Using LinearSVC

In [16]:
from sklearn.svm import LinearSVC
In [17]:
param_distribs = {
    "C": np.arange(1,3,.1)
}
lin_svc = LinearSVC(dual=False)
rnd_search_cv = RandomizedSearchCV(lin_svc, param_distributions=param_distribs, random_state=42, verbose=0)
rnd_search_cv.fit(X_train_scaled, y_train)
Out[17]:
RandomizedSearchCV(estimator=LinearSVC(dual=False),
                   param_distributions={'C': array([1. , 1.1, 1.2, 1.3, 1.4, 1.5, 1.6, 1.7, 1.8, 1.9, 2. , 2.1, 2.2,
       2.3, 2.4, 2.5, 2.6, 2.7, 2.8, 2.9])},
                   random_state=42)
In [18]:
best_svc_model = rnd_search_cv.best_estimator_
best_svc_model
Out[18]:
LinearSVC(C=2.7000000000000015, dual=False)
In [19]:
rnd_search_cv.best_score_
Out[19]:
0.580125612745098
In [20]:
y_pred = best_svc_model.predict(X_test_scaled)
print(classification_report(y_test, y_pred))
              precision    recall  f1-score   support

         3.0       0.00      0.00      0.00         2
         4.0       0.00      0.00      0.00        11
         5.0       0.62      0.77      0.69       136
         6.0       0.51      0.59      0.55       128
         7.0       0.00      0.00      0.00        40
         8.0       0.00      0.00      0.00         3

    accuracy                           0.57       320
   macro avg       0.19      0.23      0.21       320
weighted avg       0.47      0.57      0.51       320

/opt/conda/lib/python3.7/site-packages/sklearn/metrics/_classification.py:1221: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, msg_start, len(result))

LinearSVC did not perform as well as the KNN classifier: it has lower precision and recall, and it never predicts quality levels 3, 4, 7, or 8.
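One possible follow-up for the class imbalance (a sketch only, not run in this notebook) is to reweight the classes with class_weight="balanced", which penalises mistakes on the rare quality levels more heavily:

lin_svc_balanced = LinearSVC(dual=False, class_weight="balanced")
lin_svc_balanced.fit(X_train_scaled, y_train)
print(classification_report(y_test, lin_svc_balanced.predict(X_test_scaled), zero_division=0))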

In [21]:
fig, ax = plt.subplots(figsize=(20, 10))
plot_confusion_matrix(best_svc_model, X_test_scaled, y_test, cmap=plt.cm.Blues, ax=ax)
plt.show()