Environment Setting

Preparation for data process

Code

# Data manipulation libraries
import os
import pandas as pd
import numpy as np

# Visualization
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns

# Timer
from tqdm import tqdm, tqdm_notebook

# Regression
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import GradientBoostingRegressor
from xgboost import XGBRegressor 
from sklearn.neural_network import MLPRegressor
from sklearn.neighbors import KNeighborsRegressor

# Model support functions
from sklearn import model_selection
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PolynomialFeatures
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from scipy.stats import uniform
from sklearn.preprocessing import StandardScaler 
from pprint import pprint
from sklearn.inspection import permutation_importance
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# Geo-related libraries
import geopandas as gpd
import folium
from folium.plugins import HeatMap
import geopy
from geopy.geocoders import Nominatim
from geopy.extra.rate_limiter import RateLimiter
import contextily as ctx
import geofeather
from geopandas import GeoDataFrame
from shapely.geometry import Point
from shapely import wkt
from shapely.geometry import Point, MultiPoint
from shapely.ops import nearest_points
from shapely import wkt

import osmnx

Code

from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials

auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)

Code

#https://drive.google.com/file/d/1HJBU_-Yg_ylAzab8v11l1wcYEwBhO41-/view?usp=share_link
id = "1HJBU_-Yg_ylAzab8v11l1wcYEwBhO41-"
file = drive.CreateFile({'id':id}) 
file.GetContentFile('Police_Department_Incident_Reports__2018_to_Present.csv')

Data brief review

Code

#load crime data
crime_df = pd.read_csv('Police_Department_Incident_Reports__2018_to_Present.csv')
crime_df.head()

	Incident Datetime	Incident Date	Incident Time	Incident Year	Incident Day of Week	Report Datetime	Row ID	Incident ID	Incident Number	CAD Number	...	Longitude	Point	Neighborhoods	ESNCAG - Boundary File	Central Market/Tenderloin Boundary Polygon - Updated	Civic Center Harm Reduction Project Boundary	HSOC Zones as of 2018-06-05	Invest In Neighborhoods (IIN) Areas	Current Supervisor Districts	Current Police Districts
0	2023/03/13 11:41:00 PM	2023/03/13	23:41	2023	Monday	2023/03/13 11:41:00 PM	125373607041	1253736	230167874	NaN	...	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN
1	2023/03/01 05:02:00 AM	2023/03/01	05:02	2023	Wednesday	2023/03/11 03:40:00 PM	125379506374	1253795	236046151	NaN	...	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN
2	2023/03/13 01:16:00 PM	2023/03/13	13:16	2023	Monday	2023/03/13 01:17:00 PM	125357107041	1253571	220343896	NaN	...	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN
3	2023/03/13 10:59:00 AM	2023/03/13	10:59	2023	Monday	2023/03/13 11:00:00 AM	125355107041	1253551	230174885	NaN	...	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN
4	2023/03/14 06:44:00 PM	2023/03/14	18:44	2023	Tuesday	2023/03/14 06:45:00 PM	125402407041	1254024	230176728	NaN	...	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN

5 rows × 35 columns

Code

crime_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 716680 entries, 0 to 716679
Data columns (total 35 columns):
 #   Column                                                Non-Null Count   Dtype  
---  ------                                                --------------   -----  
 0   Incident Datetime                                     716680 non-null  object 
 1   Incident Date                                         716680 non-null  object 
 2   Incident Time                                         716680 non-null  object 
 3   Incident Year                                         716680 non-null  int64  
 4   Incident Day of Week                                  716680 non-null  object 
 5   Report Datetime                                       716680 non-null  object 
 6   Row ID                                                716680 non-null  int64  
 7   Incident ID                                           716680 non-null  int64  
 8   Incident Number                                       716680 non-null  int64  
 9   CAD Number                                            555602 non-null  float64
 10  Report Type Code                                      716680 non-null  object 
 11  Report Type Description                               716680 non-null  object 
 12  Filed Online                                          144690 non-null  object 
 13  Incident Code                                         716680 non-null  int64  
 14  Incident Category                                     716064 non-null  object 
 15  Incident Subcategory                                  716064 non-null  object 
 16  Incident Description                                  716680 non-null  object 
 17  Resolution                                            716680 non-null  object 
 18  Intersection                                          678513 non-null  object 
 19  CNN                                                   678513 non-null  float64
 20  Police District                                       716680 non-null  object 
 21  Analysis Neighborhood                                 678381 non-null  object 
 22  Supervisor District                                   678138 non-null  float64
 23  Supervisor District 2012                              678393 non-null  float64
 24  Latitude                                              678513 non-null  float64
 25  Longitude                                             678513 non-null  float64
 26  Point                                                 678513 non-null  object 
 27  Neighborhoods                                         664168 non-null  float64
 28  ESNCAG - Boundary File                                7761 non-null    float64
 29  Central Market/Tenderloin Boundary Polygon - Updated  92127 non-null   float64
 30  Civic Center Harm Reduction Project Boundary          91531 non-null   float64
 31  HSOC Zones as of 2018-06-05                           149426 non-null  float64
 32  Invest In Neighborhoods (IIN) Areas                   0 non-null       float64
 33  Current Supervisor Districts                          678393 non-null  float64
 34  Current Police Districts                              677736 non-null  float64
dtypes: float64(14), int64(5), object(16)
memory usage: 191.4+ MB

Code

#convert the pandas file to geopandas file
crime_df_geo = gpd.GeoDataFrame(crime_df, geometry=gpd.points_from_xy(crime_df.Longitude, crime_df.Latitude), crs={'init' :'epsg:4326'})
crime_df_geo.info()

/usr/local/lib/python3.9/dist-packages/pyproj/crs/crs.py:141: FutureWarning: '+init=<authority>:<code>' syntax is deprecated. '<authority>:<code>' is the preferred initialization method. When making the change, be mindful of axis order changes: https://pyproj4.github.io/pyproj/stable/gotchas.html#axis-order-changes-in-proj-6
  in_crs_string = _prepare_from_proj_string(in_crs_string)

<class 'geopandas.geodataframe.GeoDataFrame'>
RangeIndex: 716680 entries, 0 to 716679
Data columns (total 36 columns):
 #   Column                                                Non-Null Count   Dtype   
---  ------                                                --------------   -----   
 0   Incident Datetime                                     716680 non-null  object  
 1   Incident Date                                         716680 non-null  object  
 2   Incident Time                                         716680 non-null  object  
 3   Incident Year                                         716680 non-null  int64   
 4   Incident Day of Week                                  716680 non-null  object  
 5   Report Datetime                                       716680 non-null  object  
 6   Row ID                                                716680 non-null  int64   
 7   Incident ID                                           716680 non-null  int64   
 8   Incident Number                                       716680 non-null  int64   
 9   CAD Number                                            555602 non-null  float64 
 10  Report Type Code                                      716680 non-null  object  
 11  Report Type Description                               716680 non-null  object  
 12  Filed Online                                          144690 non-null  object  
 13  Incident Code                                         716680 non-null  int64   
 14  Incident Category                                     716064 non-null  object  
 15  Incident Subcategory                                  716064 non-null  object  
 16  Incident Description                                  716680 non-null  object  
 17  Resolution                                            716680 non-null  object  
 18  Intersection                                          678513 non-null  object  
 19  CNN                                                   678513 non-null  float64 
 20  Police District                                       716680 non-null  object  
 21  Analysis Neighborhood                                 678381 non-null  object  
 22  Supervisor District                                   678138 non-null  float64 
 23  Supervisor District 2012                              678393 non-null  float64 
 24  Latitude                                              678513 non-null  float64 
 25  Longitude                                             678513 non-null  float64 
 26  Point                                                 678513 non-null  object  
 27  Neighborhoods                                         664168 non-null  float64 
 28  ESNCAG - Boundary File                                7761 non-null    float64 
 29  Central Market/Tenderloin Boundary Polygon - Updated  92127 non-null   float64 
 30  Civic Center Harm Reduction Project Boundary          91531 non-null   float64 
 31  HSOC Zones as of 2018-06-05                           149426 non-null  float64 
 32  Invest In Neighborhoods (IIN) Areas                   0 non-null       float64 
 33  Current Supervisor Districts                          678393 non-null  float64 
 34  Current Police Districts                              677736 non-null  float64 
 35  geometry                                              716680 non-null  geometry
dtypes: float64(14), geometry(1), int64(5), object(16)
memory usage: 196.8+ MB

Code

#intial visualization
fig, ax = plt.subplots(figsize=(12, 10))
crime_df_geo.to_crs(epsg=3857).plot(ax = ax,
                figsize=(12,12),
                markersize=40,
               color="black",
               edgecolor="white",
               alpha=0.8,
               marker="o"
            );
ctx.add_basemap(ax, source=ctx.providers.OpenStreetMap.Mapnik)

Prev Analysis

The number of crime in different categories

Code

#calculate the number of crime in different categories
q1 = crime_df.groupby('Incident Category').size().reset_index(name='count')
q1 = q1.sort_values(by='count', ascending=False)
q1.head(10)

	Incident Category	count
18	Larceny Theft	218642
29	Other Miscellaneous	49878
21	Malicious Mischief	48503
1	Assault	43600
26	Non-Criminal	42605
2	Burglary	40225
24	Motor Vehicle Theft	36698
33	Recovered Vehicle	27881
12	Fraud	22995
20	Lost Property	20831

Code

#data visualization
plt.figure()
ax = q1.plot(kind = 'bar', x = 'Incident Category' , y = 'count', logy = True, color = 'red', legend = False, align = 'center')
ax.set_ylabel('count',fontsize = 12)
ax.set_xlabel('category',fontsize = 12)
plt.xticks(fontsize=5, rotation=90)
plt.title('Number of crimes for different categories')
display()

<Figure size 640x480 with 0 Axes>

From the number of crime in different categories, we can find that ‘Larceny Theft’ top the rank and significantly outnumber other kinds, folliwing ‘Malicious Mischief’ and ‘Assault’. the large number of theft, for one things, results from the many places where it can happen. For another, the change in Califonia policy for the determination of crime also lower cost of crime that causes lower property damage.

The number of crime in different PdDistricts

Code

#calculate the number of crime in different PdDistricts
q2 = crime_df.groupby('Police District').size().reset_index(name='count')
q2 = q2.sort_values(by='count', ascending=False)
q2.head(10)

	Police District	count
1	Central	107239
4	Northern	97302
3	Mission	89637
8	Southern	86064
10	Tenderloin	69222
0	Bayview	63678
2	Ingleside	54276
9	Taraval	50327
7	Richmond	44702
6	Park	32995

Code

#data visualization
plt.figure()
ax = q2.plot(kind = 'bar', x = 'Police District' , y = 'count', logy = True, color = 'yellow', legend = False, align = 'center')
ax.set_ylabel('count',fontsize = 12)
ax.set_xlabel('PdDistrict',fontsize = 12)
plt.xticks(fontsize=5, rotation=90)
plt.title('Number of crimes in different PdDistricts')
display()

<Figure size 640x480 with 0 Axes>

The trend of number of crime from 2018 to 2022

Code

# switch the form of time data for easier analysis 
crime_df[['year','month', 'day']] = crime_df['Incident Date'].str.split('/', expand=True)
crime_df['day'] = pd.to_numeric(crime_df['day'])
crime_df['month'] = pd.to_numeric(crime_df['month'])
crime_df['year'] = pd.to_numeric(crime_df['year'])
crime_df[['hour','minute']] = crime_df['Incident Time'].str.split(':', expand=True)
crime_df['hour'] = pd.to_numeric(crime_df['hour'])
crime_df['minute'] = pd.to_numeric(crime_df['minute'])

Code

#select the data from 2018 to 2022
year = [2018,2019,2020,2021,2022]
crime_df_new = crime_df[crime_df.year.isin(year)]
crime_df_new.head()

	Incident Datetime	Incident Date	Incident Time	Incident Year	Incident Day of Week	Report Datetime	Row ID	Incident ID	Incident Number	CAD Number	...	HSOC Zones as of 2018-06-05	Invest In Neighborhoods (IIN) Areas	Current Supervisor Districts	Current Police Districts	geometry	year	month	day	hour
11	2022/06/27 12:00:00 PM	2022/06/27	12:00	2022	Monday	2023/03/15 05:20:00 PM	125439371000	1254393	230184129	230742480.0	...	NaN	NaN	3.0	6.0	POINT (-122.40823 37.78736)	2022	6	27	12
25	2022/05/15 04:00:00 PM	2022/05/15	16:00	2022	Sunday	2022/05/17 07:32:00 PM	116100671000	1161006	226100018	NaN	...	NaN	NaN	NaN	NaN	POINT EMPTY	2022	5	15	16
35	2018/09/03 09:00:00 AM	2018/09/03	09:00	2018	Monday	2018/09/03 01:17:00 PM	73006271000	730062	186245900	NaN	...	NaN	NaN	NaN	NaN	POINT EMPTY	2018	9	3	9
36	2019/01/13 02:00:00 PM	2019/01/13	14:00	2019	Sunday	2019/01/14 12:10:00 PM	75982071000	759820	196009821	NaN	...	NaN	NaN	NaN	NaN	POINT EMPTY	2019	1	13	14
37	2018/11/02 12:00:00 PM	2018/11/02	12:00	2018	Friday	2018/11/03 02:21:00 PM	73376171000	733761	180833020	NaN	...	NaN	NaN	NaN	NaN	POINT EMPTY	2018	11	2	12

5 rows × 41 columns

Code

# calculate the monthly number of crime from 2018 to 2022 
q3 = crime_df_new.groupby(['year','month']).size().reset_index(name='count')
q3 = q3.sort_values(by=['year','month'], ascending=True)
q3['year-month'] = q3['year'].astype(str) + '-' + q3['month'].astype(str).str.zfill(2)
q3.head()

	year	month	count	year-month
0	2018	1	13274	2018-01
1	2018	2	11727	2018-02
2	2018	3	12565	2018-03
3	2018	4	12421	2018-04
4	2018	5	12847	2018-05

Code

#data visualization
plt.figure()
ax = q3.plot(kind = 'bar', x = 'year-month' , y = 'count', logy = True, color = 'yellow', legend = False, align = 'center')
ax.set_ylabel('count',fontsize = 12)
ax.set_xlabel('PdDistrict',fontsize = 12)
plt.xticks(fontsize=5, rotation=90)
plt.title('Change in number of crime from 2018 to 2022')
display()

<Figure size 640x480 with 0 Axes>

From the bar chart of crime number based on the month, first of all, we can find that the crime nunmber show a seasonality, which increase from every year’s March and reach a peak in September and then drop till next year’s March. Also, we can find a sharp decrease at the begining of 2020. We can assume that this situation was highly influenced by Covid-19, and the influence is gradually weakened after begining of 2020 but count of crime is less than pre-covid anually.

The hourly distribution of crime in a day?

Code

# count the number of crime in different hours
q4 = crime_df_new.groupby('hour').size().reset_index(name = 'count')
q4 = q4.sort_values(by='hour', ascending=True)
q4.head()

	hour	count
0	0	37081
1	1	18247
2	2	15878
3	3	12917
4	4	10887

Code

#chart visualization
plt.figure()
ax = q4.plot(kind = 'bar', x = 'hour' , y = 'count', logy = True, color = 'blue', legend = False, align = 'center')
ax.set_ylabel('count',fontsize = 12)
ax.set_xlabel('Hour',fontsize = 12)
plt.xticks(fontsize=5, rotation=90)
plt.title('Time distribution in a day of crime from 2018 to 2022')
display()

<Figure size 640x480 with 0 Axes>

From the number of crime change in a day, we can find that the frequency of crime have three peaks in a day: 12AM, 12PM, and 18PM. The peak time in 12AM and 18PM can be explained by the huge footfall during that time. However, the phenomenon that sudden increase in the 12AM and sharply drop after that time is hard to explained.

Code

#check the hourly distribution for different types of crime
q5 = crime_df_new.groupby(['hour','Incident Category']).size().reset_index(name = 'count')
q5 = q5.sort_values(by='count', ascending=False)
q5.head(24)

	hour	Incident Category	count
841	18	Larceny Theft	15632
887	19	Larceny Theft	14667
556	12	Larceny Theft	14128
794	17	Larceny Theft	13798
932	20	Larceny Theft	13176
510	11	Larceny Theft	12020
746	16	Larceny Theft	11591
698	15	Larceny Theft	11536
603	13	Larceny Theft	11491
650	14	Larceny Theft	11245
979	21	Larceny Theft	10208
463	10	Larceny Theft	9859
18	0	Larceny Theft	9570
1026	22	Larceny Theft	8337
417	9	Larceny Theft	7121
1073	23	Larceny Theft	6780
371	8	Larceny Theft	5897
65	1	Larceny Theft	4951
12	0	Fraud	4652
328	7	Larceny Theft	3591
111	2	Larceny Theft	3414
552	12	Fraud	3185
844	18	Malicious Mischief	2990
757	16	Other Miscellaneous	2905

Code

#check the hourly distribution for the 'Larceny Theft' category
q5_theft = q5[q5['Incident Category'].isin(['Larceny Theft'])]
q5_theft = q5_theft.sort_values(by = 'hour',ascending = True)
q5_theft.head()

	hour	Incident Category	count
18	0	Larceny Theft	9570
65	1	Larceny Theft	4951
111	2	Larceny Theft	3414
153	3	Larceny Theft	2746
198	4	Larceny Theft	2175

Code

#data visualization
plt.figure()
ax = q5_theft.plot(kind = 'bar', x = 'hour' , y = 'count', logy = True, color = 'blue', legend = False, align = 'center')
ax.set_ylabel('count',fontsize = 12)
ax.set_xlabel('Hour',fontsize = 12)
plt.xticks(fontsize=5, rotation=90)
plt.title('Time distribution in a day of theft from 2018 to 2022')
display()

<Figure size 640x480 with 0 Axes>

The Resolution situation of crime in different categories

Code

#create dummy variables based on the 'resolution' column 
crime_df_new['Resolution'].unique()
crime_df_new['Result'] = np.where(crime_df_new['Resolution'] == 'Open or Active', 0, 1)
crime_df_new.info()

SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  crime_df_new['Result'] = np.where(crime_df_new['Resolution'] == 'Open or Active', 0, 1)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 682474 entries, 11 to 716671
Data columns (total 42 columns):
 #   Column                                                Non-Null Count   Dtype   
---  ------                                                --------------   -----   
 0   Incident Datetime                                     682474 non-null  object  
 1   Incident Date                                         682474 non-null  object  
 2   Incident Time                                         682474 non-null  object  
 3   Incident Year                                         682474 non-null  int64   
 4   Incident Day of Week                                  682474 non-null  object  
 5   Report Datetime                                       682474 non-null  object  
 6   Row ID                                                682474 non-null  int64   
 7   Incident ID                                           682474 non-null  int64   
 8   Incident Number                                       682474 non-null  int64   
 9   CAD Number                                            528524 non-null  float64 
 10  Report Type Code                                      682474 non-null  object  
 11  Report Type Description                               682474 non-null  object  
 12  Filed Online                                          138643 non-null  object  
 13  Incident Code                                         682474 non-null  int64   
 14  Incident Category                                     681904 non-null  object  
 15  Incident Subcategory                                  681904 non-null  object  
 16  Incident Description                                  682474 non-null  object  
 17  Resolution                                            682474 non-null  object  
 18  Intersection                                          646095 non-null  object  
 19  CNN                                                   646095 non-null  float64 
 20  Police District                                       682474 non-null  object  
 21  Analysis Neighborhood                                 645967 non-null  object  
 22  Supervisor District                                   645740 non-null  float64 
 23  Supervisor District 2012                              645980 non-null  float64 
 24  Latitude                                              646095 non-null  float64 
 25  Longitude                                             646095 non-null  float64 
 26  Point                                                 646095 non-null  object  
 27  Neighborhoods                                         632302 non-null  float64 
 28  ESNCAG - Boundary File                                7453 non-null    float64 
 29  Central Market/Tenderloin Boundary Polygon - Updated  87260 non-null   float64 
 30  Civic Center Harm Reduction Project Boundary          86966 non-null   float64 
 31  HSOC Zones as of 2018-06-05                           142555 non-null  float64 
 32  Invest In Neighborhoods (IIN) Areas                   0 non-null       float64 
 33  Current Supervisor Districts                          645980 non-null  float64 
 34  Current Police Districts                              645342 non-null  float64 
 35  geometry                                              682474 non-null  geometry
 36  year                                                  682474 non-null  int64   
 37  month                                                 682474 non-null  int64   
 38  day                                                   682474 non-null  int64   
 39  hour                                                  682474 non-null  int64   
 40  minute                                                682474 non-null  int64   
 41  Result                                                682474 non-null  int64   
dtypes: float64(14), geometry(1), int64(11), object(16)
memory usage: 223.9+ MB

Code

# calculate the number of different kinds of crime that got resolution
q6 = crime_df_new.groupby(['Incident Category','Result']).size().reset_index(name = 'count')
q6 = q6.sort_values(by = 'Incident Category', ascending = True)
q6.head()
q6_solve = q6[q6.Result == 1]
q6_solve.head()
# merge the chart with number of crime to calculate the reolution rate
q6_merge = pd.merge(q6_solve, q1, on='Incident Category')
q6_merge['solve_rate'] = q6_merge.count_x/q6_merge.count_y
q6_merge.sort_values(by = 'solve_rate', ascending = True)

	Incident Category	Result	count_x	count_y	solve_rate
19	Lost Property	1	79	20831	0.003792
27	Other	1	67	5831	0.011490
36	Suicide	1	7	324	0.021605
37	Suspicious	1	3	102	0.029412
38	Suspicious Occ	1	474	14652	0.032351
10	Fire Report	1	35	1052	0.033270
17	Larceny Theft	1	7836	218642	0.035839
12	Fraud	1	955	22995	0.041531
22	Missing Person	1	697	15462	0.045078
23	Motor Vehicle Theft	1	1811	36698	0.049349
21	Miscellaneous Investigation	1	409	7457	0.054848
5	Courtesy Report	1	156	2101	0.074250
32	Recovered Vehicle	1	2745	27881	0.098454
20	Malicious Mischief	1	5109	48503	0.105334
31	Rape	1	19	172	0.110465
25	Non-Criminal	1	4790	42605	0.112428
2	Burglary	1	4617	40225	0.114779
9	Embezzlement	1	124	927	0.133765
33	Robbery	1	2986	15998	0.186648
0	Arson	1	450	2052	0.219298
41	Vandalism	1	389	1534	0.253585
29	Other Offenses	1	1654	6053	0.273253
11	Forgery And Counterfeiting	1	684	2461	0.277936
6	Disorderly Conduct	1	3325	11771	0.282474
1	Assault	1	12399	43600	0.284381
47	Weapons Offense	1	1653	4893	0.337830
26	Offences Against The Family And Children	1	3352	9862	0.339890
43	Vehicle Misplaced	1	104	297	0.350168
42	Vehicle Impounded	1	180	472	0.381356
16	Human Trafficking, Commercial Sex Acts	1	9	22	0.409091
39	Traffic Collision	1	739	1663	0.444378
28	Other Miscellaneous	1	26580	49878	0.532900
14	Homicide	1	61	106	0.575472
34	Sex Offense	1	445	746	0.596515
15	Human Trafficking (A), Commercial Sex Acts	1	78	130	0.600000
13	Gambling	1	20	33	0.606061
24	Motor Vehicle Theft?	1	47	74	0.635135
4	Civil Sidewalks	1	638	888	0.718468
18	Liquor Laws	1	57	77	0.740260
35	Stolen Property	1	2853	3563	0.800730
45	Weapons Carrying Etc	1	3498	4081	0.857143
7	Drug Offense	1	15127	17606	0.859196
30	Prostitution	1	811	941	0.861849
46	Weapons Offence	1	19	21	0.904762
44	Warrant	1	18952	20713	0.914981
8	Drug Violation	1	228	249	0.915663
40	Traffic Violation Arrest	1	6413	6890	0.930769
3	Case Closure	1	2804	2929	0.957323

From the resolution rate of different kinds of crime, we can also find that the top number of crime category also have a low resolution rate. ‘Larceny Theft’ only has 3% resolution rate, while ‘Marlicious Mischief’ has 10% resolution rate. We can conclude that the crime that people are most likely to confront in the daily life still have a huge influence on city life. Therefore, it’s necessary to perform a deeper analysis on which factor influence the conduct of crime.

Combine crime data with social-ecnomic data and Zoning & Construction data

clean the crime data

Code

#clean the crime dataset
crime_df_copy = crime_df_new.copy()
drop = ['Incident Datetime','Incident Date','Report Datetime','Row ID','Incident ID','Incident Number','CAD Number','Report Type Description','Filed Online','Incident Code','Incident Description',
        'CNN','Supervisor District','Central Market/Tenderloin Boundary Polygon - Updated','Civic Center Harm Reduction Project Boundary','HSOC Zones as of 2018-06-05','Invest In Neighborhoods (IIN) Areas',
        'Current Supervisor Districts','Current Police Districts']
crime_df_drop = crime_df_copy.drop(columns = drop)
crime_df_drop.info()

Code

#select the theft type data in 2020
crime_df_drop = crime_df_drop[crime_df_drop.year == 2020]
crime_df_drop = crime_df_drop[crime_df_drop['Incident Category'] == 'Larceny Theft']
crime_df_drop.reset_index()

Code

#get the geo boundary of San Francisco
sf_poly = osmnx.geocode_to_gdf('San Francisco, CA, USA')
sf_poly.plot()

Code

#convert to geopandas file
gpd_crime = gpd.GeoDataFrame(crime_df_drop, geometry = crime_df_drop['geometry'], crs={'init' :'epsg:4326'})
sf_poly.crs, gpd_crime.crs #.crs -- geodataframe

Code

#spatial join
gpd_crime_city = gpd.sjoin(gpd_crime,sf_poly,how='inner',op = 'intersects')
gpd_crime_city.tail(2)

Code

gpd_crime_city.info()

Code

#drop the useless data 
gpd_crime_city = gpd_crime_city.drop(['bbox_north','bbox_south','bbox_east','bbox_west','index_right'],axis = 1)

Create the Physical Environment Data

Street

Code

#get the street data of san francisco
sf_streets = osmnx.graph_from_place('San Francisco, California', network_type = 'drive')
nodes, edges = osmnx.graph_to_gdfs(sf_streets)

Code

nodes.head(1)

Code

edges.head(1)

Code

edges.plot()

Code

# brief information of road dataset
sf_road = edges.copy()
print(sf_road['highway'].value_counts())
print('Number of rows is ' + str(sf_road.shape[0]))
print('Number of columns is ' + str(sf_road.shape[1]))

Code

# Combining similar road types to reduce classification number
sf_road['highway'] = sf_road['highway'].str.replace('_link','')
sf_road['highway'] = np.where(sf_road['highway']=='trunk','secondary',sf_road['highway'])
sf_road['highway'] = np.where(sf_road['highway']=='living_street','residential',sf_road['highway'])
print(sf_road['highway'].value_counts())

Code

#based on road type to create sub-dataset
sf_highways = sf_road[sf_road.highway == 'motorway']
sf_pry = sf_road[sf_road.highway == 'primary']
sf_second = sf_road[sf_road.highway == 'secondary']
sf_resid = sf_road[sf_road.highway == 'residential']
sf_tertiary = sf_road[sf_road.highway == 'tertiary']

sf_highways.crs

Code

# convert to geopandas files and set function to calcualte the min distance from point to road
gpd_crime_city_utm = gpd_crime_city.to_crs({'init': 'epsg:32610'}).copy()   
highway_utm = sf_highways.to_crs({'init': 'epsg:32610'}).copy()
primary_utm = sf_pry.to_crs({'init': 'epsg:32610'}).copy()
secondary_utm = sf_second.to_crs({'init': 'epsg:32610'}).copy()
tertiary_utm = sf_tertiary.to_crs({'init': 'epsg:32610'}).copy()
residential_utm = sf_resid.to_crs({'init': 'epsg:32610'}).copy()
sf_road_utm = sf_road.to_crs({'init': 'epsg:32610'}).copy()

def distance_to_roadway(gps,roadway):
  dists = []
  for i in roadway.geometry:
    dists.append(i.distance(gps))
  return (np.min(dists))

Code

tqdm.pandas()
gpd_crime_city['Closest_highway'] = gpd_crime_city_utm['geometry'].progress_apply(distance_to_roadway,roadway = highway_utm)

Code

tqdm.pandas()
gpd_crime_city['Closest_primary'] = gpd_crime_city_utm['geometry'].progress_apply(distance_to_roadway,roadway = primary_utm)

Code

tqdm.pandas()
gpd_crime_city['Closest_secondary'] = gpd_crime_city_utm['geometry'].progress_apply(distance_to_roadway,roadway = secondary_utm)

Code

tqdm.pandas()
gpd_crime_city['Closest_tertiary'] = gpd_crime_city_utm['geometry'].progress_apply(distance_to_roadway,roadway = tertiary_utm)

Code

gpd_crime_city.head(2)

	Incident Time	Incident Year	Incident Day of Week	Report Type Code	Incident Category	Incident Subcategory	Resolution	Intersection	Police District	Analysis Neighborhood	...	lat	lon	display_name	class	type	importance	Closest_highway	Closest_primary	Closest_secondary	Closest_tertiary
9676	02:00	2020	Friday	II	Larceny Theft	Larceny Theft - Other	Open or Active	BASS CT \ WHITNEY YOUNG CIR	Bayview	Bayview Hunters Point	...	37.779026	-122.419906	San Francisco, CAL Fire Northern Region, Calif...	boundary	administrative	1.025131	1468.413426	534.950176	535.680061	506.558042
13583	20:00	2020	Saturday	II	Larceny Theft	Larceny - From Vehicle	Open or Active	CLAY ST \ LARKIN ST	Central	Russian Hill	...	37.779026	-122.419906	San Francisco, CAL Fire Northern Region, Calif...	boundary	administrative	1.025131	2149.153684	390.024468	285.145981	97.658841

2 rows × 36 columns

Code

# based on the closest distance from point to each type of roads to ensure the road type where the crime point locate
def get_category(row):
    if row['Closest_highway'] < 10:
        return 'highway'
    elif row['Closest_primary'] < 10:
        return 'primary'
    elif row['Closest_secondary'] < 10:
        return 'secondary'
    elif row['Closest_tertiary'] < 10:
        return 'tertiary'
    else:
        return 'residential'
gpd_crime_city['Crime_Location'] = gpd_crime_city.apply(get_category, axis=1)

Code

gpd_crime_city.head(2)

	Incident Time	Incident Year	Incident Day of Week	Report Type Code	Incident Category	Incident Subcategory	Resolution	Intersection	Police District	Analysis Neighborhood	...	lon	display_name	class	type	importance	Closest_highway	Closest_primary	Closest_secondary	Closest_tertiary	Crime_Location
9676	02:00	2020	Friday	II	Larceny Theft	Larceny Theft - Other	Open or Active	BASS CT \ WHITNEY YOUNG CIR	Bayview	Bayview Hunters Point	...	-122.419906	San Francisco, CAL Fire Northern Region, Calif...	boundary	administrative	1.025131	1468.413426	534.950176	535.680061	506.558042	residential
13583	20:00	2020	Saturday	II	Larceny Theft	Larceny - From Vehicle	Open or Active	CLAY ST \ LARKIN ST	Central	Russian Hill	...	-122.419906	San Francisco, CAL Fire Northern Region, Calif...	boundary	administrative	1.025131	2149.153684	390.024468	285.145981	97.658841	residential

2 rows × 37 columns

Code

gpd_crime_city.Crime_Location.value_counts()

residential    10518
secondary       8670
tertiary        6382
primary         2520
highway          123
Name: Crime_Location, dtype: int64

Zoning

Code

#https://drive.google.com/file/d/1sremJO16LufbhJdXzSiAViXUmtJCieCN/view?usp=share_link
id = "1sremJO16LufbhJdXzSiAViXUmtJCieCN"
file = drive.CreateFile({'id':id}) 
file.GetContentFile('SF_Zoning.csv')

Code

# load zoning data
zoning_df = pd.read_csv('SF_Zoning.csv')
zoning_df.head()

	the_geom	zoning_sim	districtname	url	gen	zoning	codesection
0	MULTIPOLYGON (((-122.41533105377357 37.7723137...	RED-MX	RESIDENTIAL ENCLAVE-MIXED	https://codelibrary.amlegal.com/codes/san_fran...	Mixed Use	RED-MX	847
1	MULTIPOLYGON (((-122.38902391194917 37.7381094...	NCD	BAYVIEW NEIGHBORHOOD COMMERCIAL DISTRICT	https://codelibrary.amlegal.com/codes/san_fran...	Mixed Use	NCD-BAYVIEW	737
2	MULTIPOLYGON (((-122.389357706792 37.738606785...	NCT-3	MODERATE SCALE NEIGHBORHOOD COMMERCIAL TRANSIT...	https://codelibrary.amlegal.com/codes/san_fran...	Mixed Use	NCT-3	752
3	MULTIPOLYGON (((-122.41533105377357 37.7723137...	RED	RESIDENTIAL ENCLAVE	https://codelibrary.amlegal.com/codes/san_fran...	Residential	RED	813
4	MULTIPOLYGON (((-122.38155774047206 37.7381229...	PDR-1-B	PDR LIGHT INDUSTRIAL BUFFER	https://codelibrary.amlegal.com/codes/san_fran...	Industrial	PDR-1-B	210.3

Code

from shapely.wkt import loads

Code

zoning_df['geometry'] = zoning_df['the_geom'].apply(loads)

# Create a GeoPandas DataFrame from the Pandas DataFrame
zoning_gdf = gpd.GeoDataFrame(zoning_df, crs='EPSG:4326', geometry='geometry')

Code

zoning_gdf.head()

Code

#drop irrelevant data
drop = ['zoning_sim','url','zoning','codesection']
zoning_new = zoning_gdf.drop(columns=drop)
zoning_new.head()

Code

#brief visualization 
fig, ax = plt.subplots(figsize=(6, 4))
zoning_new.to_crs(epsg=3857).plot(ax = ax,
                figsize=(12,12),
                markersize=40,
               color="black",
               edgecolor="white",
               alpha=0.8,
               marker="o"
            );
ctx.add_basemap(ax, source=ctx.providers.OpenStreetMap.Mapnik)

Code

#spatial join zoning data and crime data
gpd_crime_city_zone = gpd.sjoin(gpd_crime_city, zoning_new, op='within')
gpd_crime_city_zone.head(2)

Building

Code

#https://drive.google.com/file/d/1-7NryT-DamfZnKO7mU9ZqZsISg5TInsC/view?usp=share_link
id = "1-7NryT-DamfZnKO7mU9ZqZsISg5TInsC"
file = drive.CreateFile({'id':id}) 
file.GetContentFile('sf_building.csv')

Code

#load building footprints data
building_df = pd.read_csv('sf_building.csv')
building_df.head()

	sf16_bldgid	area_id	mblr	p2010_name	p2010_zminn88ft	p2010_zmaxn88ft	gnd_cells50cm	gnd_mincm	gnd_maxcm	gnd_rangecm	...	hgt_majoritycm	hgt_minoritycm	hgt_mediancm	gnd_min_m	median_1st_m	hgt_median_m	gnd1st_delta	peak_1st_m	globalid	shape
0	201006.000000	1	SF4570025	SanfranF_4606.flt	16.3249	66.2671	178250	507	704	197	...	813	349	850	5.07	14.16	8.50	9.09	23.85	{CF7EF595-68E6-4950-B361-CC82D77383A0}	MULTIPOLYGON (((-122.37950387699999 37.7397994...
1	201006.000000	2	SM005050270	SanfranI_09417.flt	81.7802	214.1874	144111	2515	3419	904	...	737	11	735	25.15	39.25	7.35	14.10	66.94	{806F8EDB-7B25-4054-898E-6F2CF65D072E}	MULTIPOLYGON (((-122.41894297800002 37.7076482...
2	201006.000000	3	SF3794028	Sanfran_Orig_1330.flt	12.5027	102.5737	115295	164	507	343	...	734	-86	1157	1.64	15.29	11.57	13.65	59.18	{F0F234CF-3F0B-439D-B59A-8899C2135F23}	MULTIPOLYGON (((-122.38881976200003 37.7792352...
3	201006.003228	32278	SM004154340	SanfranP_3364.flt	342.5260	381.3803	708	10454	10532	78	...	521	140	456	104.54	109.39	4.56	4.85	111.79	{C321EB73-7E6E-49B5-822B-3E7DD3A7F0AD}	MULTIPOLYGON (((-122.45552821000001 37.7074943...
4	201006.000000	4	SF7295021	NaN	NaN	NaN	107634	4346	5185	839	...	1078	16	1153	43.46	61.58	11.53	18.12	69.72	{C52C089C-B0A6-4716-8358-C568A65861DC}	MULTIPOLYGON (((-122.47734992300002 37.7289024...

5 rows × 43 columns

Code

##keep the geometry and height data
building_df = building_df[['peak_1st_m','shape']]
building_df.head(2)

Code

building_df['geometry'] = building_df['shape'].apply(loads)
# Create a GeoPandas DataFrame from the Pandas DataFrame
building_gdf = gpd.GeoDataFrame(building_df, crs='EPSG:4326', geometry='geometry')

Code

building_gdf.head(2)

Code

# Calculate the area of each building
building_gdf['area'] = building_gdf['geometry'].area
building_gdf.head(2)

Code

# turan the form of data from polygon to point for find nearest point
point_gdf = building_gdf.copy()
point_gdf["geometry"] = building_gdf.centroid
point_gdf.head(2)

Code

gpd_crime_city_zone = gpd_crime_city_zone.rename(columns={'index_left': 'left'})
gpd_crime_city_zone = gpd_crime_city_zone.rename(columns={'index_right': 'right'})

Code

gpd_crime_city_zone.head(2)

Code

from google.colab import drive
drive.mount('/content/drive')
#save_path = '/content/drive/MyDrive/Colab Notebooks/data/gpd_crime_city_zone.csv'
#df.to_csv(save_path, index=False)

Mounted at /content/drive

Code

#https://drive.google.com/file/d/1WkxWvKEsSpq50x-BEasGqmsgu6pi91_m/view?usp=share_link
id = "1WkxWvKEsSpq50x-BEasGqmsgu6pi91_m"
file = drive.CreateFile({'id':id}) 
file.GetContentFile('gpd_crime_city_zone.csv')

Code

gpd_crime_city_zone = pd.read_csv('gpd_crime_city_zone.csv')
gpd_crime_city_zone.head()

	Incident Time	Incident Year	Incident Day of Week	Report Type Code	Incident Category	Incident Subcategory	Resolution	Intersection	Police District	Analysis Neighborhood	...	Closest_primary	Closest_secondary	Closest_tertiary	Crime_Location	right	the_geom	districtname	gen	buffer_5m	buffer
0	02:00	2020	Friday	II	Larceny Theft	Larceny Theft - Other	Open or Active	BASS CT \ WHITNEY YOUNG CIR	Bayview	Bayview Hunters Point	...	534.950176	535.680061	506.558042	residential	815	MULTIPOLYGON (((-122.38222192626027 37.7365910...	RESIDENTIAL- HOUSE, TWO FAMILY	Residential	POLYGON ((-117.38469170531177 37.7337597458953...	POLYGON ((-117.38469170531177 37.7337597458953...
1	13:59	2020	Sunday	VI	Larceny Theft	Theft From Vehicle	Open or Active	WHITNEY YOUNG CIR \ CASHMERE ST \ DEDMAN CT	Bayview	Bayview Hunters Point	...	364.965970	433.206386	203.201740	residential	815	MULTIPOLYGON (((-122.38222192626027 37.7365910...	RESIDENTIAL- HOUSE, TWO FAMILY	Residential	POLYGON ((-117.38571301129124 37.7366011278019...	POLYGON ((-117.38571301129124 37.7366011278019...
2	16:28	2020	Friday	II	Larceny Theft	Larceny Theft - Other	Open or Active	LA SALLE AVE \ OSCEOLA LN	Bayview	Bayview Hunters Point	...	833.743266	461.886008	835.429639	residential	815	MULTIPOLYGON (((-122.38222192626027 37.7365910...	RESIDENTIAL- HOUSE, TWO FAMILY	Residential	POLYGON ((-117.3819485575142 37.73152913469796...	POLYGON ((-117.3819485575142 37.73152913469796...
3	18:00	2020	Tuesday	II	Larceny Theft	Larceny - Auto Parts	Open or Active	INGALLS ST \ HARBOR RD	Bayview	Bayview Hunters Point	...	951.461748	255.279620	825.220932	residential	815	MULTIPOLYGON (((-122.38222192626027 37.7365910...	RESIDENTIAL- HOUSE, TWO FAMILY	Residential	POLYGON ((-117.3799317268322 37.73338073279184...	POLYGON ((-117.3799317268322 37.73338073279184...
4	21:00	2020	Sunday	VI	Larceny Theft	Theft From Vehicle	Open or Active	CASHMERE ST \ HUDSON AVE	Bayview	Bayview Hunters Point	...	582.547890	387.780374	420.026310	residential	815	MULTIPOLYGON (((-122.38222192626027 37.7365910...	RESIDENTIAL- HOUSE, TWO FAMILY	Residential	POLYGON ((-117.38342544658872 37.7358032525956...	POLYGON ((-117.38342544658872 37.7358032525956...

5 rows × 43 columns

Code

gpd_crime_city_zone = gpd.GeoDataFrame(gpd_crime_city_zone, geometry=gpd.points_from_xy(gpd_crime_city_zone.Longitude, gpd_crime_city_zone.Latitude), crs={'init' :'epsg:4326'})
gpd_crime_city_zone.info()

Code

from scipy.spatial import cKDTree

Code

pd_crime_city_zone = np.array(list(gpd_crime_city_zone.geometry.apply(lambda pt: (pt.x, pt.y))))
point_df = np.array(list(point_gdf.geometry.apply(lambda pt: (pt.x, pt.y))))

# Build cKDTree from crime points
tree = cKDTree(point_df)

# Query tree for nearest crime to each Boston point
dist, idx = tree.query(pd_crime_city_zone, k=1)

# Add nearest crime column to boston GeoDataFrame
gpd_crime_city_zone['builidng_nn1'] = point_gdf.loc[idx, 'peak_1st_m'].values

Code

drop = ['buffer_5m','buffer','right']
gpd_crime_city_zone = gpd_crime_city_zone.drop(columns=drop)

Code

val_dict = {'highway': 24, 'primary': 20, 'secondary': 12, 'tertiary': 8, 'residential': 6}
gpd_crime_city_zone['road_width'] = gpd_crime_city_zone['Crime_Location'].map(val_dict)

Code

gpd_crime_city_zone['rate_BS'] = gpd_crime_city_zone['builidng_nn1']/gpd_crime_city_zone['road_width']

Census tract

load basic census tract boundary

Code

#https://drive.google.com/file/d/1DbA90nYecBQnwRkcqUAAP8vT7OQP9MWz/view?usp=share_link
id = "1DbA90nYecBQnwRkcqUAAP8vT7OQP9MWz"
file = drive.CreateFile({'id':id}) 
file.GetContentFile('sf_centra.csv')

Code

ct_df = pd.read_csv('sf_centra.csv')
ct_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 176 entries, 0 to 175
Data columns (total 47 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   the_geom    176 non-null    object 
 1   OBJECTID    176 non-null    int64  
 2   FIPSSTCO    176 non-null    int64  
 3   TRT2000     176 non-null    int64  
 4   STFID       176 non-null    int64  
 5   TRACTID     176 non-null    float64
 6   STATE       176 non-null    int64  
 7   COUNTY      176 non-null    int64  
 8   TRACT       176 non-null    int64  
 9   POP2000     176 non-null    int64  
 10  WHITE       176 non-null    int64  
 11  BLACK       176 non-null    int64  
 12  AMERI_ES    176 non-null    int64  
 13  ASIAN       176 non-null    int64  
 14  HAWN_PI     176 non-null    int64  
 15  OTHER       176 non-null    int64  
 16  MULT_RACE   176 non-null    int64  
 17  HISPANIC    176 non-null    int64  
 18  MALES       176 non-null    int64  
 19  FEMALES     176 non-null    int64  
 20  AGE_UNDER5  176 non-null    int64  
 21  AGE_5_17    176 non-null    int64  
 22  AGE_18_21   176 non-null    int64  
 23  AGE_22_29   176 non-null    int64  
 24  AGE_30_39   176 non-null    int64  
 25  AGE_40_49   176 non-null    int64  
 26  AGE_50_64   176 non-null    int64  
 27  AGE_65_UP   176 non-null    int64  
 28  MED_AGE     176 non-null    float64
 29  MED_AGE_M   176 non-null    float64
 30  MED_AGE_F   176 non-null    float64
 31  HOUSEHOLDS  176 non-null    int64  
 32  AVE_HH_SZ   176 non-null    float64
 33  HSEHLD_1_M  176 non-null    int64  
 34  HSEHLD_1_F  176 non-null    int64  
 35  MARHH_CHD   176 non-null    int64  
 36  MARHH_NO_C  176 non-null    int64  
 37  MHH_CHILD   176 non-null    int64  
 38  FHH_CHILD   176 non-null    int64  
 39  FAMILIES    176 non-null    int64  
 40  AVE_FAM_SZ  176 non-null    float64
 41  HSE_UNITS   176 non-null    int64  
 42  URBAN       176 non-null    int64  
 43  RURAL       176 non-null    int64  
 44  VACANT      176 non-null    int64  
 45  OWNER_OCC   176 non-null    int64  
 46  RENTER_OCC  176 non-null    int64  
dtypes: float64(6), int64(40), object(1)
memory usage: 64.8+ KB

Code

ct_df['TRACTID'] = ct_df['TRACTID'].astype('object')

load social-economic data based on the census tract

Code

import censusdata
from census import Census
from us import states

Code

# Define the Census API key
censusdata.censuskey = "93db71b5bb2f994846b61d1cb53de6b204ed41b8"
c = Census("93db71b5bb2f994846b61d1cb53de6b204ed41b8")
year = 2020
state_fips = '06'  # California
county_fips = '075'  # San Francisco County

#   B07010_004 -- Estimate!!Total:!!With income:!!$1 to $9,999 or loss -- poverty indicator
# B06009_002 -- Estimate!!Total:!!Less than high school graduate
# B09010_002 -- Estimate!!Total:!!Living in household with Supplemental Security Income (SSI), cash public assistance income, or Food Stamps/SNAP in the past 12 months:
# B27011_008 -- Estimate!!Total:!!In labor force:!!Unemployed:
# B11013_002 -- Estimate!!Total:!!Married-couple subfamily
# B11013_005 -- Estimate!!Total:!!Mother-child subfamily
# B11013_006 -- Estimate!!Total:!!Father-child subfamily
va_census = c.acs5.state_county_tract(fields = ('NAME','B07010_004E','B06009_002E', 'B09010_002E', 'B27011_008E','B11014_005E','B11014_006E','B11014_009E'),
                                      state_fips = '06',
                                      county_fips = '075',
                                      tract = "*",
                                      year = 2020)

Code

sfp_df = pd.DataFrame(va_census)
sfp_df.head(2)

Code

sfp_df['tract'] = sfp_df['NAME'].str.extract(r'Census Tract (\d+\.\d+)', expand=False)

Code

sfp_df.head(2)

Code

sfp_df = sfp_df.rename(columns={'B07010_004E': 'low_income', 'B06009_002E':'low_education','B09010_002E':'fd_secure','B27011_008E':'unemploy','B11014_005E':'couple_fa',
                                'B11014_006E':'ma-child','B11014_009E':'fa-child'})
sfp_df.head(2)

Code

sfp_df = sfp_df.rename(columns={'tract': 'TRACTID'})
sfp_df.head(2)

Code

socieco_df = pd.merge(ct_df, sfp_df, on='TRACTID',how='left')

Code

ct_df['geometry'] = ct_df['the_geom'].apply(loads)
# Create a GeoPandas DataFrame from the Pandas DataFrame
ct_gdf = gpd.GeoDataFrame(ct_df, crs='EPSG:4326', geometry='geometry')

Data Analysis and Visualization

Code

# the data that crime with place's social-economic type
gpd_crime_fnl = gpd.sjoin(gpd_crime_city_zone, ct_gdf, op='within')

Code

pd.set_option('display.max_columns', None)

Code

gpd_crime_fnl.head(2)

	Incident Time	Incident Year	Incident Day of Week	Report Type Code	Incident Category	Incident Subcategory	Resolution	Intersection	Police District	Analysis Neighborhood	Supervisor District 2012	Latitude	Longitude	Point	Neighborhoods	ESNCAG - Boundary File	geometry	year	month	day	hour	minute	Result	place_id	osm_type	osm_id	lat	lon	display_name	class	type	importance	Closest_highway	Closest_primary	Closest_secondary	Closest_tertiary	Crime_Location	the_geom_left	districtname	gen	builidng_nn1	road_width	rate_BS	index_right	the_geom_right	OBJECTID	FIPSSTCO	TRT2000	STFID	TRACTID	STATE	COUNTY	TRACT	POP2000	WHITE	BLACK	AMERI_ES	ASIAN	HAWN_PI	OTHER	MULT_RACE	HISPANIC	MALES	FEMALES	AGE_UNDER5	AGE_5_17	AGE_18_21	AGE_22_29	AGE_30_39	AGE_40_49	AGE_50_64	AGE_65_UP	MED_AGE	MED_AGE_M	MED_AGE_F	HOUSEHOLDS	AVE_HH_SZ	HSEHLD_1_M	HSEHLD_1_F	MARHH_CHD	MARHH_NO_C	MHH_CHILD	FHH_CHILD	FAMILIES	AVE_FAM_SZ	HSE_UNITS	URBAN	RURAL	VACANT	OWNER_OCC	RENTER_OCC
0	02:00	2020	Friday	II	Larceny Theft	Larceny Theft - Other	Open or Active	BASS CT \ WHITNEY YOUNG CIR	Bayview	Bayview Hunters Point	10.0	37.733760	-122.384692	POINT (-122.38469170531177 37.73375974589532)	86.0	NaN	POINT (-122.38469 37.73376)	2020	4	10	2	0	0	297983890	relation	111968	37.779026	-122.419906	San Francisco, CAL Fire Northern Region, Calif...	boundary	administrative	1.025131	1468.413426	534.950176	535.680061	506.558042	residential	MULTIPOLYGON (((-122.38222192626027 37.7365910...	RESIDENTIAL- HOUSE, TWO FAMILY	Residential	18.58	6	3.096667	42	MULTIPOLYGON (((-122.38155725680222 37.7381228...	239	6075	23102	6075023102	231.02	6	75	23102	3600	230	2625	7	421	29	148	140	321	1637	1963	262	957	180	340	559	504	482	316	31.0	26.7	34.5	1259	2.86	111	203	171	117	24	307	876	3.4	1289	0	0	30	456	803
1	13:59	2020	Sunday	VI	Larceny Theft	Theft From Vehicle	Open or Active	WHITNEY YOUNG CIR \ CASHMERE ST \ DEDMAN CT	Bayview	Bayview Hunters Point	10.0	37.736601	-122.385713	POINT (-122.38571301129124 37.7366011278019)	86.0	NaN	POINT (-122.38571 37.73660)	2020	2	9	13	59	0	297983890	relation	111968	37.779026	-122.419906	San Francisco, CAL Fire Northern Region, Calif...	boundary	administrative	1.025131	1194.113022	364.965970	433.206386	203.201740	residential	MULTIPOLYGON (((-122.38222192626027 37.7365910...	RESIDENTIAL- HOUSE, TWO FAMILY	Residential	18.58	6	3.096667	42	MULTIPOLYGON (((-122.38155725680222 37.7381228...	239	6075	23102	6075023102	231.02	6	75	23102	3600	230	2625	7	421	29	148	140	321	1637	1963	262	957	180	340	559	504	482	316	31.0	26.7	34.5	1259	2.86	111	203	171	117	24	307	876	3.4	1289	0	0	30	456	803

Code

gpd_crime_fnl['gen'].value_counts()

Code

def get_category(row):
    if row['gen'] == 'Residential':
        return 1
    elif row['gen'] == 'Mixed Use':
        return 2
    elif row['gen'] == 'Commercial':
        return 3
    elif row['gen'] == 'Public':
        return 4
    else:
        return 5
gpd_crime_fnl['ztype'] = gpd_crime_fnl.apply(get_category, axis=1)

Code

gpd_crime_fnl['Crime_Location'].value_counts()

Code

def get_category(row):
    if row['Crime_Location'] == 'residential':
        return 1
    elif row['Crime_Location'] == 'secondary':
        return 2
    elif row['Crime_Location'] == 'tertiary':
        return 3
    elif row['Crime_Location'] == 'primary':
        return 4
    else:
        return 5
gpd_crime_fnl['rtype'] = gpd_crime_fnl.apply(get_category, axis=1)

Code

# Numerical Features
numerical = ['Result','Neighborhoods', 'Supervisor District 2012', 'month', 'day', 'hour', 'minute', 'Closest_highway', 'Closest_primary', 'Closest_secondary', 'Closest_tertiary', 
              'road_width', 'rate_BS', 'POP2000','Longitude', 'Latitude', 'MALES', 'FEMALES','AGE_UNDER5','AGE_5_17','AGE_18_21','AGE_22_29','AGE_30_39','AGE_40_49',
             'AGE_50_64','AGE_65_UP','MED_AGE','MED_AGE_M','MED_AGE_F','HOUSEHOLDS','AVE_HH_SZ','HSEHLD_1_M','HSEHLD_1_F','MARHH_CHD','MARHH_NO_C','MHH_CHILD','FHH_CHILD','FAMILIES',
             'AVE_FAM_SZ','HSE_UNITS','VACANT','OWNER_OCC','RENTER_OCC','rtype','ztype']

# Categorical Features 
categorical = ['Incident Day of Week', 'Report Type Code', 'Crime_Location','gen']

Code

df_vs = gpd_crime_fnl.copy()

Code

f, ax = plt.subplots(figsize= [20,15])
sns.heatmap(df_vs[numerical].corr(), annot=True, fmt=".2f", ax=ax, cmap = "magma" )
ax.set_title("Correlation Matrix", fontsize=20)
plt.show()

Code

Corr = pd.DataFrame(df_vs[numerical].corr()['Result'].sort_values(ascending = False))
Corr = Corr.iloc[1:,:]
Corr.columns = ['Targer Correlation']
Corr

	Targer Correlation
minute	0.125436
MED_AGE_M	0.103229
MED_AGE	0.082230
Longitude	0.082165
ztype	0.080871
Closest_primary	0.071255
VACANT	0.070051
Latitude	0.059627
MED_AGE_F	0.049638
road_width	0.039319
rtype	0.038533
HSEHLD_1_M	0.025309
RENTER_OCC	0.001603
day	-0.000748
AGE_18_21	-0.001998
hour	-0.005705
AGE_65_UP	-0.006762
AVE_FAM_SZ	-0.012245
AGE_50_64	-0.017518
AGE_40_49	-0.024185
HSE_UNITS	-0.026224
month	-0.026534
Closest_highway	-0.026694
MALES	-0.027852
HSEHLD_1_F	-0.028811
FHH_CHILD	-0.031284
HOUSEHOLDS	-0.034578
AGE_5_17	-0.037363
Neighborhoods	-0.037629
MHH_CHILD	-0.041164
MARHH_NO_C	-0.041853
MARHH_CHD	-0.042327
Supervisor District 2012	-0.042750
AVE_HH_SZ	-0.044648
POP2000	-0.044909
AGE_UNDER5	-0.047650
FAMILIES	-0.047962
AGE_22_29	-0.054750
Closest_tertiary	-0.056600
FEMALES	-0.060230
rate_BS	-0.065186
AGE_30_39	-0.068832
Closest_secondary	-0.070957
OWNER_OCC	-0.078604

Code

g0 = sns.barplot(x="Targer Correlation", y=Corr.index, data=Corr)
g0.figure.set_size_inches(12, 9)

Code

gpd_crime_vs = gpd_crime_fnl.copy()

Code

plt.figure(figsize = (11, 10))
plt.scatter(gpd_crime_vs.Longitude, gpd_crime_vs.Latitude, s=5, c = gpd_crime_vs.hour)
plt.colorbar(); plt.xlabel('Longitude', fontsize=18); plt.ylabel('Latitude', fontsize=18)

Text(0, 0.5, 'Latitude')

Code

plt.figure(figsize = (11, 10))
plt.scatter(gpd_crime_vs.Longitude, gpd_crime_vs.Latitude, s=5, c = gpd_crime_vs.Result)
plt.colorbar(); plt.xlabel('Longitude', fontsize=18); plt.ylabel('Latitude', fontsize=18)

Text(0, 0.5, 'Latitude')

Code

_,axss = plt.subplots(2,3, figsize = [20,8])
sns.countplot(x = 'Crime_Location', data = gpd_crime_vs,ax=axss[0][0])
sns.countplot(x = 'gen', data = gpd_crime_vs,ax=axss[0][1])
sns.countplot(x = 'Result', hue = gpd_crime_vs['gen'], data = gpd_crime_vs,ax=axss[0][2])
sns.countplot(x = 'Result', hue = gpd_crime_vs['Crime_Location'], data = gpd_crime_vs,ax=axss[1][0])
sns.countplot(x = 'Result', hue = gpd_crime_vs['Incident Day of Week'], data = gpd_crime_vs,ax=axss[1][1])
sns.countplot(x = 'Result', hue = gpd_crime_vs['Incident Subcategory'], data = gpd_crime_vs,ax=axss[1][2])

<Axes: xlabel='Result', ylabel='count'>

Code

_,axss = plt.subplots(2,3, figsize = [20,10])
sns.boxplot(x='Result',y='AGE_30_39',data=gpd_crime_vs,ax=axss[0][0])
sns.boxplot(x='Result',y='FEMALES',data=gpd_crime_vs,ax=axss[0][1])
sns.boxplot(x='Result',y='AGE_22_29',data=gpd_crime_vs,ax=axss[0][2])
sns.boxplot(x='Result',y='OWNER_OCC',data=gpd_crime_vs,ax=axss[1][0])
sns.boxplot(x='Result',y='MED_AGE_M',data=gpd_crime_vs,ax=axss[1][1])
sns.boxplot(x='Result',y='VACANT',data=gpd_crime_vs,ax=axss[1][2])

<Axes: xlabel='Result', ylabel='VACANT'>

Code

_,axss = plt.subplots(2,2, figsize = [20,10])
sns.boxplot(x='Result',y='Closest_secondary',data=gpd_crime_vs,ax=axss[0][0])
sns.boxplot(x='Result',y='Closest_tertiary',data=gpd_crime_vs,ax=axss[0][1])
sns.boxplot(x='Result',y='Closest_primary',data=gpd_crime_vs,ax=axss[1][0])
sns.boxplot(x='Result',y='road_width',data=gpd_crime_vs,ax=axss[1][1])

<Axes: xlabel='Result', ylabel='road_width'>

Code

dropy = ['Incident Time','Incident Year','Incident Day of Week','Report Type Code','Incident Category','Incident Subcategory','Resolution','Intersection','Police District','Analysis Neighborhood',
        'Supervisor District 2012','Latitude','Longitude','Point','ESNCAG - Boundary File','geometry','year','minute','place_id','osm_type','osm_id','lat','lon','display_name',
        'class','type','Crime_Location','the_geom_left','districtname','gen','index_right','the_geom_right','OBJECTID','FIPSSTCO','TRT2000','STFID','TRACTID','STATE','COUNTY',
        'TRACT']
crime_md = gpd_crime_fnl.drop(dropy, axis = 1)

Code

crime_md.dtypes

Neighborhoods        float64
month                  int64
day                    int64
hour                   int64
Result                 int64
importance           float64
Closest_highway      float64
Closest_primary      float64
Closest_secondary    float64
Closest_tertiary     float64
builidng_nn1         float64
road_width             int64
rate_BS              float64
POP2000                int64
WHITE                  int64
BLACK                  int64
AMERI_ES               int64
ASIAN                  int64
HAWN_PI                int64
OTHER                  int64
MULT_RACE              int64
HISPANIC               int64
MALES                  int64
FEMALES                int64
AGE_UNDER5             int64
AGE_5_17               int64
AGE_18_21              int64
AGE_22_29              int64
AGE_30_39              int64
AGE_40_49              int64
AGE_50_64              int64
AGE_65_UP              int64
MED_AGE              float64
MED_AGE_M            float64
MED_AGE_F            float64
HOUSEHOLDS             int64
AVE_HH_SZ            float64
HSEHLD_1_M             int64
HSEHLD_1_F             int64
MARHH_CHD              int64
MARHH_NO_C             int64
MHH_CHILD              int64
FHH_CHILD              int64
FAMILIES               int64
AVE_FAM_SZ           float64
HSE_UNITS              int64
URBAN                  int64
RURAL                  int64
VACANT                 int64
OWNER_OCC              int64
RENTER_OCC             int64
rtype                  int64
ztype                  int64
dtype: object

Model Construction

Code

crime_md.fillna(0, inplace=True)

Code

X = crime_md.drop(['Result'],axis = 1)
y = crime_md['Result']

Code

from sklearn import model_selection as mdslct
X_train, X_test, y_train, y_test = mdslct.train_test_split(X, y, test_size = 0.25, stratify= y, random_state= 1)

print('training data has ' + str(X_train.shape[0]) + ' observation with ' + str(X_train.shape[1]) + ' features')
print('test data has ' + str(X_test.shape[0]) + ' observation with ' + str(X_test.shape[1]) + ' features')

training data has 21165 observation with 52 features
test data has 7055 observation with 52 features

Code

from sklearn.preprocessing import StandardScaler
scale = StandardScaler()
scale.fit(X_train)
X_train = scale.transform(X_train)
X_test = scale.transform(X_test)

Code

from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier 
from sklearn.linear_model import LogisticRegression

classifier_rd = RandomForestClassifier()
classifier_KNN = KNeighborsClassifier()
classifier_lr = LogisticRegression()

Code

classifier_rd.fit(X_train,y_train)
classifier_rd.predict(X_test)
classifier_rd.score(X_test,y_test)

Code

classifier_KNN.fit(X_train,y_train)
classifier_KNN.predict(X_test)
classifier_KNN.score(X_test,y_test)

Code

classifier_lr.fit(X_train,y_train,sample_weight=1)
classifier_lr.predict(X_test)
classifier_lr.score(X_test,y_test)

Code

from sklearn.model_selection import GridSearchCV
def print_grid_search_metrics(gs):
  print("best_score =" + str(gs.best_score_))
  print('best_parameter set:')
  best_parameter = gs.best_params_
  for param_name in sorted(best_parameter.keys()):
    print(param_name + ':' + str(best_parameter[param_name]))

Code

#to find the optimal hyperparamater
#penalty is the way of regularization, C is the weight for penalty
parameters = {
    'penalty':('l2','l1'), 
    'C':(0.01, 0.05, 0.1, 0.2, 1,1.1,1.2)
}
Grid_LR = GridSearchCV(LogisticRegression(solver='liblinear'),parameters, cv=5)
Grid_LR.fit(X_train, y_train)

GridSearchCV(cv=5, estimator=LogisticRegression(solver='liblinear'),
             param_grid={'C': (0.01, 0.05, 0.1, 0.2, 1, 1.1, 1.2),
                         'penalty': ('l2', 'l1')})

In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.

Code

print(print_grid_search_metrics(Grid_LR))

best_score =0.9539333805811481
best_parameter set:
C:0.01
penalty:l2
None

Code

best_LR_model = Grid_LR.best_estimator_

Code

best_LR_model.predict(X_test)

array([0, 0, 0, ..., 0, 0, 0])

Code

best_LR_model.score(X_test,y_test)

0.9539333805811481

Code

# optimal hyperparameter for KNN
parameters = {
    'n_neighbors':[1,3,5,7]
} #the name of parameters should match that in KNN
Grid_KNN = GridSearchCV(KNeighborsClassifier(),parameters,cv=5)
Grid_KNN.fit(X_train, y_train)

GridSearchCV(cv=5, estimator=KNeighborsClassifier(),
             param_grid={'n_neighbors': [1, 3, 5, 7]})

Code

print(print_grid_search_metrics(Grid_KNN))

best_score =0.9526104417670682
best_parameter set:
n_neighbors:7
None

Code

best_KNN_model = Grid_KNN.best_estimator_

Code

best_KNN_model.predict(X_test)

array([0, 0, 0, ..., 0, 0, 0])

Code

best_KNN_model.score(X_test,y_test)

0.9523742026931254

Code

# optimal hyperparameter for Random Forest
parameters = {
    'n_estimators' : [50,60,80,100],
    'max_depth': [1,5,8,10]
}
Grid_RM = GridSearchCV(RandomForestClassifier(),parameters,cv=5)
Grid_RM.fit(X_train,y_train)

GridSearchCV(cv=5, estimator=RandomForestClassifier(),
             param_grid={'max_depth': [1, 5, 8, 10],
                         'n_estimators': [50, 60, 80, 100]})

Code

print(print_grid_search_metrics(Grid_RM))

best_score =0.9581384360973304
best_parameter set:
max_depth:10
n_estimators:80
None

Code

best_RM_model = Grid_RM.best_estimator_

Code

best_RM_model.predict(X_test)

array([0, 0, 0, ..., 0, 0, 0])

Code

best_RM_model.score(X_test,y_test)

0.9587526576895818

Code

from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score

# calculate accuracy, precision and recall, [[tn, fp],[]]
def cal_evaluation(classifier, cm):
    tn = cm[0][0]
    fp = cm[0][1]
    fn = cm[1][0]
    tp = cm[1][1]
    accuracy = (tp+ tn)/(tp + tn + fp + fn + 0.0)
    precision = tp/(tp + fp + 0.0)
    recall = tp/(tp + fn + 0.0)
    print (classifier)
    print ("Accuracy is: " + str(accuracy))
    print ("precision is: " + str(precision))
    print ("recall is: " + str(recall))
    print ()

# print out confusion matrices
def draw_confusion_matrices(confusion_matricies):
    class_names = ['Not','Churn']
    for cm in confusion_matrices:
        classifier, cm = cm[0], cm[1]
        cal_evaluation(classifier, cm)

Code

confusion_matrices = [
    ("Random Forest", confusion_matrix(y_test,best_RM_model.predict(X_test))),
    ("Logistic Regression", confusion_matrix(y_test,best_LR_model.predict(X_test))),
    ("K nearest neighbor", confusion_matrix(y_test, best_KNN_model.predict(X_test)))
]

draw_confusion_matrices(confusion_matrices)

Random Forest
Accuracy is: 0.9587526576895818
precision is: 0.7931034482758621
recall is: 0.14153846153846153

Logistic Regression
Accuracy is: 0.9539333805811481
precision is: nan
recall is: 0.0

K nearest neighbor
Accuracy is: 0.9523742026931254
precision is: 0.44554455445544555
recall is: 0.13846153846153847

RuntimeWarning: invalid value encountered in true_divide
  precision = tp/(tp + fp + 0.0)

Code

from sklearn.metrics import roc_curve
from sklearn import metrics
#ROC for Random Forest
y_pred_rm = best_RM_model.predict_proba(X_test)[:, 1]
fpr_rf, tpr_rf, _ = roc_curve(y_test, y_pred_rm)

Code

best_RM_model.predict_proba(X_test)

array([[0.9804783 , 0.0195217 ],
       [0.96240263, 0.03759737],
       [0.98051001, 0.01948999],
       ...,
       [0.93483543, 0.06516457],
       [0.97694558, 0.02305442],
       [0.97324386, 0.02675614]])

Code

# ROC curve of Random Forest result
import matplotlib.pyplot as plt
plt.figure(1)
plt.plot([0, 1], [0, 1], 'k--')
plt.plot(fpr_rf, tpr_rf, label='RF')
plt.xlabel('False positive rate')
plt.ylabel('True positive rate')
plt.title('ROC curve - RF model')
plt.legend(loc='best')
plt.show()

Code

from sklearn import metrics

# AUC score
metrics.auc(fpr_rf,tpr_rf)

0.7818637558578123

Code

#ROC for Logistic Regression 
y_pred_lr = best_LR_model.predict_proba(X_test)[:, 1]
fpr_rf, tpr_rf, _ = roc_curve(y_test, y_pred_lr)

Code

best_LR_model.predict_proba(X_test)

array([[0.96862073, 0.03137927],
       [0.96606054, 0.03393946],
       [0.95716253, 0.04283747],
       ...,
       [0.94351064, 0.05648936],
       [0.92430863, 0.07569137],
       [0.92842141, 0.07157859]])

Code

plt.figure(1)
plt.plot([0, 1], [0, 1], 'k--')
plt.plot(fpr_rf, tpr_rf, label='RF')
plt.xlabel('False positive rate')
plt.ylabel('True positive rate')
plt.title('ROC curve - LR model')
plt.legend(loc='best')
plt.show()

Code

metrics.auc(fpr_rf,tpr_rf)

0.7121924791404731

Code

x_corr = X.copy()
scaler = StandardScaler()
X_L1 = scaler.fit_transform(x_corr)
LRmodel_l1 = LogisticRegression(penalty="l1", C = 0.04, solver='liblinear')
LRmodel_l1.fit(X_L1,y)

indices = np.argsort(abs(LRmodel_l1.coef_[0]))[::-1]

print ("Logistic Regression (L1) Coefficients")
for ind in range(x_corr.shape[1]):
  print ("{0} : {1}".format(x_corr.columns[indices[ind]],round(LRmodel_l1.coef_[0][indices[ind]], 4)))

Logistic Regression (L1) Coefficients
MED_AGE_M : 0.3342
WHITE : -0.2047
Closest_secondary : -0.2016
Closest_primary : 0.1925
Closest_tertiary : -0.1874
rate_BS : -0.1429
ztype : 0.123
HAWN_PI : 0.1183
AGE_18_21 : 0.1173
OTHER : 0.1098
VACANT : 0.1073
ASIAN : 0.0678
AGE_65_UP : -0.0657
month : -0.0526
hour : -0.0477
OWNER_OCC : -0.0447
HSEHLD_1_F : -0.0366
Neighborhoods : -0.0362
MULT_RACE : -0.0351
HSEHLD_1_M : -0.0249
Closest_highway : 0.0211
rtype : 0.0193
BLACK : -0.0161
RENTER_OCC : 0.0
importance : 0.0
builidng_nn1 : 0.0
road_width : 0.0
RURAL : 0.0
POP2000 : 0.0
URBAN : 0.0
AMERI_ES : 0.0
day : 0.0
HSE_UNITS : 0.0
FAMILIES : 0.0
HISPANIC : 0.0
FHH_CHILD : 0.0
MHH_CHILD : 0.0
MARHH_NO_C : 0.0
MARHH_CHD : 0.0
AVE_HH_SZ : 0.0
HOUSEHOLDS : 0.0
MED_AGE_F : 0.0
AVE_FAM_SZ : 0.0
MED_AGE : 0.0
AGE_50_64 : 0.0
AGE_40_49 : 0.0
AGE_30_39 : 0.0
AGE_22_29 : 0.0
AGE_5_17 : 0.0
AGE_UNDER5 : 0.0
FEMALES : 0.0
MALES : 0.0

Code

scaler = StandardScaler()
X_L2 = scaler.fit_transform(x_corr)
LRmodel_l2 = LogisticRegression(penalty="l2", C = 0.04, solver='liblinear')
LRmodel_l2.fit(X_L2,y)

indices = np.argsort(abs(LRmodel_l2.coef_[0]))[::-1]

print ("Logistic Regression (L2) Coefficients")
for ind in range(x_corr.shape[1]):
  print ("{0} : {1}".format(x_corr.columns[indices[ind]],round(LRmodel_l2.coef_[0][indices[ind]], 4)))

Logistic Regression (L2) Coefficients
MED_AGE_M : 0.3263
HSEHLD_1_M : -0.2878
WHITE : -0.2381
AGE_5_17 : -0.2108
rate_BS : -0.2069
Closest_secondary : -0.2057
ASIAN : 0.196
MARHH_NO_C : 0.1924
Closest_primary : 0.1819
HAWN_PI : 0.1792
Closest_tertiary : -0.1763
AVE_HH_SZ : -0.1724
VACANT : 0.1557
AVE_FAM_SZ : 0.1522
AGE_65_UP : -0.1417
OTHER : 0.1396
MARHH_CHD : -0.1302
AGE_UNDER5 : 0.1262
AGE_18_21 : 0.1249
MULT_RACE : -0.1247
HISPANIC : 0.121
ztype : 0.1203
AGE_40_49 : 0.1187
MHH_CHILD : -0.1183
HSEHLD_1_F : -0.1088
AMERI_ES : 0.0906
AGE_50_64 : -0.0891
Closest_highway : 0.0882
road_width : -0.0823
MED_AGE : 0.0727
hour : -0.0718
MED_AGE_F : -0.067
month : -0.0657
HSE_UNITS : 0.0609
RENTER_OCC : 0.059
Neighborhoods : -0.0553
FHH_CHILD : 0.0535
HOUSEHOLDS : 0.0487
rtype : 0.0418
AGE_22_29 : 0.0297
AGE_30_39 : 0.0292
MALES : -0.0243
POP2000 : -0.0209
OWNER_OCC : -0.0185
FEMALES : -0.0151
BLACK : 0.015
FAMILIES : -0.0093
day : -0.0067
builidng_nn1 : -0.0
URBAN : 0.0
RURAL : 0.0
importance : 0.0

Code

# check feature importance of random forest for feature selection
forest = RandomForestClassifier()
forest.fit(x_corr, y)

importances = forest.feature_importances_

indices = np.argsort(importances)[::-1]

# Print the feature ranking
print("Feature importance ranking by Random Forest Model:")
for ind in range(X.shape[1]):
  print ("{0} : {1}".format(x_corr.columns[indices[ind]],round(importances[indices[ind]], 4)))

Feature importance ranking by Random Forest Model:
day : 0.221
hour : 0.2031
month : 0.1718
Closest_primary : 0.0668
Closest_tertiary : 0.0593
Closest_secondary : 0.0564
Closest_highway : 0.0555
Neighborhoods : 0.0167
ztype : 0.0133
MED_AGE : 0.0076
rate_BS : 0.0076
road_width : 0.0074
MED_AGE_M : 0.0072
rtype : 0.0066
OWNER_OCC : 0.0053
VACANT : 0.0042
MULT_RACE : 0.004
ASIAN : 0.0037
MED_AGE_F : 0.0036
FEMALES : 0.0036
AGE_22_29 : 0.0036
AMERI_ES : 0.0035
AGE_40_49 : 0.0032
HSEHLD_1_M : 0.0031
MALES : 0.0031
POP2000 : 0.003
HISPANIC : 0.003
AGE_UNDER5 : 0.0029
HSEHLD_1_F : 0.0029
FAMILIES : 0.0029
BLACK : 0.0028
RENTER_OCC : 0.0028
HSE_UNITS : 0.0028
MARHH_NO_C : 0.0027
AGE_18_21 : 0.0026
HOUSEHOLDS : 0.0026
AGE_65_UP : 0.0026
AGE_50_64 : 0.0026
WHITE : 0.0025
OTHER : 0.0025
MARHH_CHD : 0.0025
AGE_5_17 : 0.0025
FHH_CHILD : 0.0024
AGE_30_39 : 0.0024
MHH_CHILD : 0.0023
AVE_HH_SZ : 0.0021
HAWN_PI : 0.0021
AVE_FAM_SZ : 0.0017
builidng_nn1 : 0.0
URBAN : 0.0
RURAL : 0.0
importance : 0.0