In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session
/kaggle/input/sf-crime/train.csv.zip
/kaggle/input/sf-crime/test.csv.zip
/kaggle/input/sf-crime/sampleSubmission.csv.zip

Insights

Lets start by visualising the datasets in pandas

In [2]:
test_set_full_raw = pd.read_csv("../input/sf-crime/test.csv.zip", compression="zip", index_col='Id')
train_set_full_raw = pd.read_csv("../input/sf-crime/train.csv.zip", compression="zip")
In [3]:
train_set_full_raw.head()
Out[3]:
Dates Category Descript DayOfWeek PdDistrict Resolution Address X Y
0 2015-05-13 23:53:00 WARRANTS WARRANT ARREST Wednesday NORTHERN ARREST, BOOKED OAK ST / LAGUNA ST -122.425892 37.774599
1 2015-05-13 23:53:00 OTHER OFFENSES TRAFFIC VIOLATION ARREST Wednesday NORTHERN ARREST, BOOKED OAK ST / LAGUNA ST -122.425892 37.774599
2 2015-05-13 23:33:00 OTHER OFFENSES TRAFFIC VIOLATION ARREST Wednesday NORTHERN ARREST, BOOKED VANNESS AV / GREENWICH ST -122.424363 37.800414
3 2015-05-13 23:30:00 LARCENY/THEFT GRAND THEFT FROM LOCKED AUTO Wednesday NORTHERN NONE 1500 Block of LOMBARD ST -122.426995 37.800873
4 2015-05-13 23:30:00 LARCENY/THEFT GRAND THEFT FROM LOCKED AUTO Wednesday PARK NONE 100 Block of BRODERICK ST -122.438738 37.771541
In [4]:
#Check what columns are present in the test set.
train_set_full_raw.columns
Out[4]:
Index(['Dates', 'Category', 'Descript', 'DayOfWeek', 'PdDistrict',
       'Resolution', 'Address', 'X', 'Y'],
      dtype='object')

We will check if the train set contains any null values or missing rows. The data looks fine here so no data cleaning needed.

In [5]:
train_set_full_raw.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 878049 entries, 0 to 878048
Data columns (total 9 columns):
 #   Column      Non-Null Count   Dtype  
---  ------      --------------   -----  
 0   Dates       878049 non-null  object 
 1   Category    878049 non-null  object 
 2   Descript    878049 non-null  object 
 3   DayOfWeek   878049 non-null  object 
 4   PdDistrict  878049 non-null  object 
 5   Resolution  878049 non-null  object 
 6   Address     878049 non-null  object 
 7   X           878049 non-null  float64
 8   Y           878049 non-null  float64
dtypes: float64(2), object(7)
memory usage: 60.3+ MB
In [6]:
train_set_full_raw.describe()
Out[6]:
X Y
count 878049.000000 878049.000000
mean -122.422616 37.771020
std 0.030354 0.456893
min -122.513642 37.707879
25% -122.432952 37.752427
50% -122.416420 37.775421
75% -122.406959 37.784369
max -120.500000 90.000000

However, looking at the summary of X, Y columns, there seems to be an outlier. Most of the Y falls around the 37 range but there is one at 90. Lets remove the 90 from our dataset

In [7]:
ts = train_set_full_raw.copy()
ts = ts[ts["Y"]<90]

By plotting the X, Y coordinates we can see that the crimes cluster at certain regions of SF.

In [8]:
ts.plot(x="X", y="Y", kind="scatter", alpha=0.01,figsize=(15,12))
Out[8]:
<matplotlib.axes._subplots.AxesSubplot at 0x7fdd28a447d0>

We can also see the most common crime categories recorded and what are their most common resolutions

In [9]:
ts["Category"].value_counts()
Out[9]:
LARCENY/THEFT                  174885
OTHER OFFENSES                 126165
NON-CRIMINAL                    92300
ASSAULT                         76872
DRUG/NARCOTIC                   53971
VEHICLE THEFT                   53772
VANDALISM                       44724
WARRANTS                        42206
BURGLARY                        36754
SUSPICIOUS OCC                  31412
MISSING PERSON                  25989
ROBBERY                         22999
FRAUD                           16679
FORGERY/COUNTERFEITING          10609
SECONDARY CODES                  9985
WEAPON LAWS                      8555
PROSTITUTION                     7484
TRESPASS                         7325
STOLEN PROPERTY                  4539
SEX OFFENSES FORCIBLE            4387
DISORDERLY CONDUCT               4318
DRUNKENNESS                      4280
RECOVERED VEHICLE                3138
KIDNAPPING                       2341
DRIVING UNDER THE INFLUENCE      2268
RUNAWAY                          1946
LIQUOR LAWS                      1903
ARSON                            1513
LOITERING                        1225
EMBEZZLEMENT                     1166
SUICIDE                           508
FAMILY OFFENSES                   491
BAD CHECKS                        406
BRIBERY                           289
EXTORTION                         256
SEX OFFENSES NON FORCIBLE         148
GAMBLING                          146
PORNOGRAPHY/OBSCENE MAT            22
TREA                                6
Name: Category, dtype: int64
In [10]:
print("Most common resolutions for each category in percentage\n")
for i in ts.groupby(["Category"])["Resolution"]:
  print('\033[95m'+i[0]+'\033[0m')
  print(round(i[1].value_counts()[:3]/i[1].count()*100,1))
  print()
Most common resolutions for each category in percentage

ARSON
NONE               82.5
ARREST, BOOKED     13.4
JUVENILE BOOKED     1.1
Name: Resolution, dtype: float64

ASSAULT
NONE              58.2
ARREST, BOOKED    30.5
ARREST, CITED      4.6
Name: Resolution, dtype: float64

BAD CHECKS
NONE                            79.1
PROSECUTED BY OUTSIDE AGENCY     6.2
ARREST, BOOKED                   6.2
Name: Resolution, dtype: float64

BRIBERY
ARREST, BOOKED                            55.0
NONE                                      37.4
DISTRICT ATTORNEY REFUSES TO PROSECUTE     2.1
Name: Resolution, dtype: float64

BURGLARY
NONE              83.9
ARREST, BOOKED    13.2
UNFOUNDED          0.7
Name: Resolution, dtype: float64

DISORDERLY CONDUCT
ARREST, BOOKED    34.9
NONE              31.6
ARREST, CITED     31.0
Name: Resolution, dtype: float64

DRIVING UNDER THE INFLUENCE
ARREST, BOOKED    85.5
ARREST, CITED      8.2
NONE               5.7
Name: Resolution, dtype: float64

DRUG/NARCOTIC
ARREST, BOOKED    81.1
NONE               8.6
ARREST, CITED      8.3
Name: Resolution, dtype: float64

DRUNKENNESS
ARREST, BOOKED    76.5
NONE              16.3
ARREST, CITED      5.1
Name: Resolution, dtype: float64

EMBEZZLEMENT
NONE                                      71.2
ARREST, BOOKED                            14.0
DISTRICT ATTORNEY REFUSES TO PROSECUTE     4.0
Name: Resolution, dtype: float64

EXTORTION
NONE                                      74.2
ARREST, BOOKED                            16.8
DISTRICT ATTORNEY REFUSES TO PROSECUTE     2.7
Name: Resolution, dtype: float64

FAMILY OFFENSES
NONE              57.8
ARREST, BOOKED    25.7
UNFOUNDED          4.7
Name: Resolution, dtype: float64

FORGERY/COUNTERFEITING
NONE                            62.7
ARREST, BOOKED                  18.3
PROSECUTED BY OUTSIDE AGENCY     8.6
Name: Resolution, dtype: float64

FRAUD
NONE              75.4
ARREST, BOOKED    11.4
NOT PROSECUTED     5.7
Name: Resolution, dtype: float64

GAMBLING
NONE              35.6
ARREST, BOOKED    26.7
ARREST, CITED     26.7
Name: Resolution, dtype: float64

KIDNAPPING
NONE                                      46.9
ARREST, BOOKED                            43.3
DISTRICT ATTORNEY REFUSES TO PROSECUTE     3.0
Name: Resolution, dtype: float64

LARCENY/THEFT
NONE              89.5
ARREST, BOOKED     5.7
ARREST, CITED      3.0
Name: Resolution, dtype: float64

LIQUOR LAWS
ARREST, BOOKED    47.5
ARREST, CITED     37.5
NONE              10.5
Name: Resolution, dtype: float64

LOITERING
ARREST, CITED     60.7
ARREST, BOOKED    26.8
NONE              10.8
Name: Resolution, dtype: float64

MISSING PERSON
LOCATED           57.8
NONE              36.1
ARREST, BOOKED     1.7
Name: Resolution, dtype: float64

NON-CRIMINAL
NONE                 74.2
PSYCHOPATHIC CASE    14.6
ARREST, BOOKED        3.4
Name: Resolution, dtype: float64

OTHER OFFENSES
ARREST, CITED     37.5
ARREST, BOOKED    32.7
NONE              26.3
Name: Resolution, dtype: float64

PORNOGRAPHY/OBSCENE MAT
NONE              45.5
ARREST, BOOKED    22.7
ARREST, CITED     18.2
Name: Resolution, dtype: float64

PROSTITUTION
ARREST, CITED     63.4
ARREST, BOOKED    31.1
NONE               4.0
Name: Resolution, dtype: float64

RECOVERED VEHICLE
NONE               93.3
ARREST, BOOKED      6.2
JUVENILE BOOKED     0.2
Name: Resolution, dtype: float64

ROBBERY
NONE               76.6
ARREST, BOOKED     16.9
JUVENILE BOOKED     2.3
Name: Resolution, dtype: float64

RUNAWAY
LOCATED            62.5
NONE               33.8
JUVENILE BOOKED     1.4
Name: Resolution, dtype: float64

SECONDARY CODES
NONE                                54.4
ARREST, BOOKED                      32.5
COMPLAINANT REFUSES TO PROSECUTE     3.2
Name: Resolution, dtype: float64

SEX OFFENSES FORCIBLE
NONE                                      52.3
ARREST, BOOKED                            20.5
DISTRICT ATTORNEY REFUSES TO PROSECUTE     8.3
Name: Resolution, dtype: float64

SEX OFFENSES NON FORCIBLE
NONE                                      46.6
ARREST, BOOKED                            21.6
DISTRICT ATTORNEY REFUSES TO PROSECUTE    10.1
Name: Resolution, dtype: float64

STOLEN PROPERTY
ARREST, BOOKED     80.6
NONE               12.4
JUVENILE BOOKED     2.3
Name: Resolution, dtype: float64

SUICIDE
NONE                 72.0
PSYCHOPATHIC CASE    20.3
ARREST, BOOKED        3.3
Name: Resolution, dtype: float64

SUSPICIOUS OCC
NONE              87.7
UNFOUNDED          3.2
ARREST, BOOKED     2.9
Name: Resolution, dtype: float64

TREA
ARREST, BOOKED    50.0
NONE              50.0
Name: Resolution, dtype: float64

TRESPASS
ARREST, CITED     35.2
ARREST, BOOKED    32.2
NONE              29.6
Name: Resolution, dtype: float64

VANDALISM
NONE              87.7
ARREST, BOOKED     6.4
ARREST, CITED      3.2
Name: Resolution, dtype: float64

VEHICLE THEFT
NONE              91.6
UNFOUNDED          4.0
ARREST, BOOKED     3.8
Name: Resolution, dtype: float64

WARRANTS
ARREST, BOOKED     91.9
NONE                5.2
JUVENILE BOOKED     1.5
Name: Resolution, dtype: float64

WEAPON LAWS
ARREST, BOOKED    56.7
NONE              27.5
ARREST, CITED      6.9
Name: Resolution, dtype: float64

The dataset gave us the dates of the crime. However, a more useful feature would be the hour the crime occured. Therefore, lets add it to our dataset.

In [11]:
from datetime import datetime

ts["Hour"] = ts.Dates.apply(lambda date_string: date_string[11:-6])
ts.head()
Out[11]:
Dates Category Descript DayOfWeek PdDistrict Resolution Address X Y Hour
0 2015-05-13 23:53:00 WARRANTS WARRANT ARREST Wednesday NORTHERN ARREST, BOOKED OAK ST / LAGUNA ST -122.425892 37.774599 23
1 2015-05-13 23:53:00 OTHER OFFENSES TRAFFIC VIOLATION ARREST Wednesday NORTHERN ARREST, BOOKED OAK ST / LAGUNA ST -122.425892 37.774599 23
2 2015-05-13 23:33:00 OTHER OFFENSES TRAFFIC VIOLATION ARREST Wednesday NORTHERN ARREST, BOOKED VANNESS AV / GREENWICH ST -122.424363 37.800414 23
3 2015-05-13 23:30:00 LARCENY/THEFT GRAND THEFT FROM LOCKED AUTO Wednesday NORTHERN NONE 1500 Block of LOMBARD ST -122.426995 37.800873 23
4 2015-05-13 23:30:00 LARCENY/THEFT GRAND THEFT FROM LOCKED AUTO Wednesday PARK NONE 100 Block of BRODERICK ST -122.438738 37.771541 23

Lets see which day of the week and hour crime occurs the most:

In [12]:
ts.groupby(["DayOfWeek"])["Hour"].value_counts()
Out[12]:
DayOfWeek  Hour
Friday     18      8711
           17      8381
           19      7785
           12      7774
           16      7658
                   ... 
Wednesday  02      2377
           06      2054
           03      1701
           05      1303
           04      1268
Name: Hour, Length: 168, dtype: int64

Preprocessing

Now we can start preparing our dataset for training. We will only extract the relevant columns into our train and test set.

In [13]:
import numpy as np

train_full = ts.copy()
X_train_full, y_train_full = np.array(train_full[["DayOfWeek", "PdDistrict", "X", "Y", "Hour"]]), np.array(train_full[["Category"]])
y_train_full = y_train_full.ravel()
X_test = test_set_full_raw.copy()
X_test["Hour"] = X_test.Dates.apply(lambda date_string: date_string[11:-6])
X_test = X_test.drop(columns=["Dates", "Address"])
X_test = np.array(X_test)

We further split the full training set into train and validation set. We use Stratified sampling to ensure that the training and val set contains a proper representation of the categories present in the total population

In [14]:
import sklearn
from sklearn.model_selection import StratifiedShuffleSplit

sss = StratifiedShuffleSplit(n_splits=1, test_size=0.25, random_state=42)

for train_index, test_index in sss.split(X_train_full, y_train_full):
  X_train, y_train = X_train_full[train_index], y_train_full[train_index]
  X_val, y_val = X_train_full[test_index], y_train_full[test_index]

Next, lets set a pipeline to preprocess the datasets. StandardScaler() to normalise the numerical atttributes and OneHotEncoder() to convert the categorical attributes to arrays.

In [15]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer

num_attribs = [2,3]
cat_attribs = [0,1,4]

num_pipeline = Pipeline([
                         ('std_scaler', StandardScaler())
])

full_pipeline = ColumnTransformer([
                                 ('num', num_pipeline, num_attribs),
                                 ('cat', OneHotEncoder(), cat_attribs) 
])

X_train_prepared = full_pipeline.fit_transform(X_train)
X_val_prepared = full_pipeline.transform(X_val)
X_test_prepared = full_pipeline.transform(X_test)

Select and train model

In [16]:
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier,VotingClassifier
import xgboost
In [17]:
X_train_prepared.shape
Out[17]:
(658486, 43)

The train set contains 658 486 rows which lead to slow training time for me. Therefore, we will reduce the training set to 100 000 rows use Stratified Sampling again to get a good representation of the population.

In [18]:
ss = StratifiedShuffleSplit(n_splits=1, train_size=100_000, random_state=42)
for train_index, _ in ss.split(X_train_prepared, y_train):
  X_train_prepared_small, y_train_small = X_train_prepared[train_index], y_train[train_index].ravel()

X_train_prepared_small.shape, y_train_small.shape
Out[18]:
((100000, 43), (100000,))

Ensemble Learning aggregates the prediction of a group of predictors. We usually get better results from it compared with just a single best individual predictor.

From a prior, not very thorough, testing with different classifiers such as LinearSVC, BaggingClassifier, ExtraTreesClassifier. I found that Ensemble learning with XGBoost and RandomForest yield the best results.

In [19]:
rf_clf = RandomForestClassifier(max_depth=16, random_state=42, n_jobs=-1, verbose=3)
xg_clf = xgboost.XGBClassifier()

estimators = [
            ("rf", rf_clf),
            ("xg", xg_clf)
]

voting_clf = VotingClassifier(estimators, n_jobs=-1, voting="soft")
voting_clf.fit(X_train_prepared_small, y_train_small)
voting_clf.score(X_val_prepared, y_val)
[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  24 tasks      | elapsed:    0.9s
[Parallel(n_jobs=4)]: Done 100 out of 100 | elapsed:    3.3s finished
Out[19]:
0.2659866239020301

Finally, lets create the csv file for submission. This submission should give us a score of about 2.506:

In [20]:
y_pred = voting_clf.predict_proba(X_test_prepared)
pred_df = pd.DataFrame(y_pred, columns=[voting_clf.classes_])
pred_df["Id"]= list(range(pred_df.shape[0]))
pred_df.to_csv("crime_pred_02.zip", compression="zip", index=False)
pred_df.head()
[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  24 tasks      | elapsed:    3.5s
[Parallel(n_jobs=4)]: Done 100 out of 100 | elapsed:   12.9s finished
Out[20]:
ARSON ASSAULT BAD CHECKS BRIBERY BURGLARY DISORDERLY CONDUCT DRIVING UNDER THE INFLUENCE DRUG/NARCOTIC DRUNKENNESS EMBEZZLEMENT ... STOLEN PROPERTY SUICIDE SUSPICIOUS OCC TREA TRESPASS VANDALISM VEHICLE THEFT WARRANTS WEAPON LAWS Id
0 0.002600 0.092769 0.000130 0.000276 0.022762 0.001422 0.002095 0.027841 0.002173 0.000773 ... 0.014149 0.000085 0.022919 6.997829e-07 0.002507 0.073337 0.130898 0.049242 0.013172 0
1 0.004448 0.120573 0.000145 0.000698 0.014472 0.001840 0.005577 0.095367 0.003332 0.000241 ... 0.002835 0.000099 0.025346 6.374235e-07 0.003044 0.059236 0.054834 0.088342 0.032687 1
2 0.000588 0.063818 0.000083 0.000024 0.081312 0.001652 0.002970 0.019283 0.004084 0.000335 ... 0.003394 0.000675 0.024432 6.033798e-07 0.004649 0.066084 0.105820 0.025941 0.004329 2
3 0.001465 0.102073 0.000262 0.000278 0.018288 0.001526 0.004698 0.031347 0.002733 0.002787 ... 0.008294 0.000525 0.025992 2.303354e-06 0.003507 0.070705 0.191155 0.040288 0.021307 3
4 0.001465 0.102073 0.000262 0.000278 0.018288 0.001526 0.004698 0.031347 0.002733 0.002787 ... 0.008294 0.000525 0.025992 2.303354e-06 0.003507 0.070705 0.191155 0.040288 0.021307 4

5 rows × 40 columns