Association rule mining is a technique to identify underlying relations between different items.
In this use case, for example, if a restaurant offers Korean cuisine, there is a pattern in what other cuisines such restaurants offer. For instance, a Korean barbecue restaurant may also have seafood on its cuisine list. In short, as a food lover, I felt the cuisines offered by different restaurants may follow a pattern and wanted to explore it further. More patterns about what kind of food is offered can be generated if the relationships between cuisines in different restaurants can be identified.
Import the JSON file
# --- Load the Yelp business dataset into a DataFrame ---
import os
import pandas as pd

# Report the starting directory before switching into the data folder.
# (The original cell evaluated `os.getcwd()` as a bare expression, which only
# displayed in a notebook; printing keeps that information in a plain script.)
print('Current working directory:', os.getcwd())
os.chdir('c:\\Users\\Nijanth Anand\\Downloads\\Apriori Problem')
# Read the line-delimited JSON file (one business record per line) into a dataframe.
df = pd.read_json('yelp_academic_dataset_business.json', lines=True)
df.head(10)
print('Size of the JSON file', df.shape)
print(df.columns)
#print(df.dtypes)
We select the businesses that are restaurants by checking for the attribute keyword 'RestaurantsReservations'.
# Keep only the businesses whose attributes mention 'RestaurantsReservations' —
# this attribute key only appears on restaurant-type businesses.
reservation_mask = df['attributes'].astype(str).str.contains('RestaurantsReservations')
rest_data = df[reservation_mask]
print('Total Number of businesses:', df.shape[0])
print('Number of Restaurants in it:', rest_data.shape[0])
# Peek at one example row of the filtered restaurant data.
print('Sample record in the dataset')
rest_data.iloc[2]
import folium
from folium import plugins
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
#We generate a Location list from out dataset.
rest_data=rest_data.reset_index(drop=True)
locations = rest_data[['latitude', 'longitude']].reset_index(drop=True)
locationlist = locations.values.tolist()
len(locationlist)
locationlist[7]
#For complexity reasons we only plot a sample of the data.
import random
random.seed(4)
locationlist = random.sample(locationlist, 500)
print('Number of Location samples taken for below Interactive Folium map',len(locationlist))
#https://georgetsilva.github.io/posts/mapping-points-with-folium/
map = folium.Map(location=[37.0902, -95.7129], zoom_start=3.5) #Set the starting point as centre of the United States.
for point in range(0, len(locationlist)):
folium.Marker(locationlist[point], popup=rest_data['name'][point],icon=folium.Icon(color='red',icon='glass')).add_to(map)
map
Understanding the distribution of restaurants across different states.
# Count distinct businesses per state to see the geographic spread,
# ordered from the most to the least represented state.
state_counts = rest_data.groupby('state')['business_id'].nunique()
rest_state_data = state_counts.sort_values(ascending=False)
print('No of states', rest_state_data.size)
rest_state_data
#Generate the attributes from the categories column.
data = rest_data['categories']
#print(data.dtypes)
#We now convert the categories into items in a list so they can be used for Associate Rule Mining.
# A category is dropped when it contains ANY of these substrings: generic
# keywords (Restaurants, Bars, Food, ...) or non-cuisine categories (Casinos,
# Hotels, ...). The original applied one list comprehension per keyword; a
# single pass with any() is equivalent. Note the original also listed
# ' Event Planning & Services' (leading space) separately — redundant, since
# the substring test for 'Event Planning & Services' already covers it.
EXCLUDED_SUBSTRINGS = (
    'Bars', 'Restaurants', 'Food', 'Nightlife', 'Beer', 'Arts', 'Events',
    'Caterers', 'Hotels', 'Casinos', 'Event Planning & Services',
    'Active Life', 'Art Galleries', 'Gas Stations', 'Day Spas', 'Books',
    'Shopping', 'Breakfast & Brunch', 'American (New)', 'Sandwiches', 'Salads',
)
a = []  # one list of kept categories per restaurant (the apriori transactions)
b = []  # flat list of every kept category, used for the frequency plot below
for i in data:
    temp = str(i).split(",")
    # Keep only categories that match none of the excluded substrings.
    temp = [k for k in temp
            if not any(word in k for word in EXCLUDED_SUBSTRINGS)]
    a.append(temp)
    b.extend(temp)  # extend() instead of the original quadratic b = b + temp
print('Total count of repetitive categories/cusines in the Restaurant data: ', len(b))
print('\nCusine Distribution of Top 50 by frequency: ')
fig = plt.gcf()
fig.set_size_inches(15, 7)
pd.DataFrame(b)[0].value_counts().nlargest(50).plot(kind='bar')
plt.show()
from apyori import apriori

# Mine association rules over the per-restaurant category lists in `a`.
# NOTE(review): apyori reads its options from **kwargs, so the unknown key
# 'min_length' is silently ignored — the size cap it actually supports is
# 'max_length'. Left as-is here to avoid changing which rules are produced;
# confirm the original author's intent.
association_rules = apriori(a, min_support=0.0017, min_confidence=0.40,
                            min_lift=1, min_length=2)
association_results = list(association_rules)

for item in association_results:
    # BUG FIX: the original did `items = [x for x in item[0]]` and printed
    # items[0] -> items[1]. item[0] is an arbitrarily-ordered frozenset, so
    # that crashes with IndexError on single-item relations and silently
    # drops categories on relations with three or more items. The antecedent
    # and consequent come from the ordered statistic instead — the same
    # record the original indexed as item[2][0].
    stat = item.ordered_statistics[0]
    base = ', '.join(stat.items_base)
    add = ', '.join(stat.items_add)
    print("Rule: " + base + " -> " + add)
    # Support of the whole itemset (original item[1]).
    print("Support: " + str(item.support))
    # Confidence and lift of the first ordered statistic
    # (original item[2][0][2] and item[2][0][3]).
    print("Confidence: " + str(stat.confidence))
    print("Lift: " + str(stat.lift))
    print("=====================================")