import pandas as pd
import numpy as np
import plotly.graph_objects as go
import plotly.express as px
from plotly.subplots import make_subplots
from datetime import datetime
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
xls = pd.ExcelFile('KPMG_VI_New_raw_data_update_final.xlsx')
transactions = pd.read_excel(xls, 'Transactions', header=1)
demographic = pd.read_excel(xls, 'CustomerDemographic', header=1)
address = pd.read_excel(xls, 'CustomerAddress', header=1)
transactions.shape
transactions.head(5)
demographic.shape
demographic.head(5)
address.shape
address.head(5)
headerColor = '#dade9b'
rowSpecialColor = '#ffeb74'
rowColor = 'white'
nullValues = [transactions[item].isna().sum() for item in transactions.columns]
allValues = [transactions.shape[0] for item in transactions.columns]
percentages = [round((transactions[item].isna().sum() / transactions.shape[0])*100, 2) \
for item in transactions.columns]
colors = [rowSpecialColor if x == max(nullValues) else rowColor for x in nullValues]
fig = go.Figure(data=[go.Table(header=dict(values=['<b>Columns</b>','<b>NULL</b>',
'<b>All Values</b>','<b> Percentage (%)</b>'],
line_color='darkslategray', fill_color=headerColor,
align=['left', 'center'], font=dict(color='darkslategray', size=12)),
cells=dict(values=[transactions.columns, nullValues, allValues, percentages],
line_color='darkslategray',
fill_color = [colors],
align = ['left', 'center'], font=dict(color='black', size=11)))])
fig.update_layout(title='Transactions')
fig.show()
The Online Order column has the most NULL values.
First, we will fill the NULL values in the Online Order column with 1.0 when the order status is Approved and 0.0 when it is Cancelled, because most online orders in the data have an Approved order status.
transactions.loc[transactions['order_status'].eq('Approved') & transactions['online_order'].isnull(), 'online_order'] = 1.0
transactions.loc[transactions['order_status'].eq('Cancelled') & transactions['online_order'].isnull(), 'online_order'] = 0.0
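A quick sanity check (optional) that no NULL values remain in the online_order column:
# Should return 0 if every missing online_order had an Approved or Cancelled status
transactions['online_order'].isna().sum()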
Next, for the Brand, Product Line, Product Class, and Product Size columns, we will fill missing values with the most frequent value (the mode). As the charts below show, SOLEX is the best-selling brand, so NaN brand values are replaced with SOLEX.
brands = transactions['brand'].value_counts().to_frame().reset_index()\
.rename(columns={'index': 'Brand', 'brand': 'Counts'})
product_line = transactions['product_line'].value_counts().to_frame().reset_index()\
.rename(columns={'index': 'Product Line', 'product_line': 'Counts'})
product_class = transactions['product_class'].value_counts().to_frame().reset_index()\
.rename(columns={'index': 'Product Class', 'product_class': 'Counts'})
product_size = transactions['product_size'].value_counts().to_frame().reset_index()\
.rename(columns={'index': 'Product Size', 'product_size': 'Counts'})
fig = make_subplots(rows=4, cols=1)
fig.add_trace(go.Bar(x=brands['Brand'], y=brands['Counts'], name='Brand',
marker=dict(color='#ffdfd3')), 1, 1)
fig.add_trace(go.Bar(x=product_line['Product Line'], y=product_line['Counts'], name='Product Line',
marker=dict(color='#957dad')), 2, 1)
fig.add_trace(go.Bar(x=product_class['Product Class'], y=product_class['Counts'], name='Product Class',
marker=dict(color='#fec8d8')), 3, 1)
fig.add_trace(go.Bar(x=product_size['Product Size'], y=product_size['Counts'], name='Product Size',
marker=dict(color='#e0bbe4')), 4, 1)
fig.update_layout(template='none', title='The Most Frequent Values')
fig.update_yaxes(showgrid=False, showticklabels=False)
fig.show()
transactions['brand'] = transactions['brand'].fillna(transactions['brand'].mode().iloc[0])
transactions['product_line'] = transactions['product_line'].fillna(transactions['product_line'].mode().iloc[0])
transactions['product_class'] = transactions['product_class'].fillna(transactions['product_class'].mode().iloc[0])
transactions['product_size'] = transactions['product_size'].fillna(transactions['product_size'].mode().iloc[0])
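A quick verification (optional) that the four imputed columns no longer contain NULLs:
# Each of the four columns should now report zero missing values
transactions[['brand', 'product_line', 'product_class', 'product_size']].isna().sum()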
For the Standard Cost column, we will fill the NULL values with the average (mean) cost.
transactions['standard_cost'] = transactions['standard_cost'].fillna(transactions['standard_cost'].mean())
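If the cost distribution turns out to be heavily skewed, the median would be a more robust choice than the mean; a hedged alternative, not applied here:
# Alternative (sketch only, not applied): impute with the median instead of the mean
# transactions['standard_cost'] = transactions['standard_cost'].fillna(transactions['standard_cost'].median())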
Unfortunately, the values in the Product First Sold Date column are unclear, and it does not make sense to guess dates, so we will drop the entire column. If more information becomes available, we could derive a new column from it; for now, we drop it to keep the data clean.
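One possibility (an assumption, not verified against a data dictionary) is that these values are Excel serial day numbers; if so, they could be decoded before dropping the column:
# Hypothetical decoding, assuming Excel serial dates (kept commented out since we drop the column below)
# transactions['product_first_sold_date'] = pd.to_datetime(
#     transactions['product_first_sold_date'], unit='D', origin='1899-12-30')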
transactions.drop(columns='product_first_sold_date', inplace=True)
Finally, the cleaned transactions data:
transactions.head()
transactions.info()
headerColor = '#dade9b'
rowSpecialColor = '#ffeb74'
rowColor = 'white'
nullValues = [demographic[item].isna().sum() for item in demographic.columns]
allValues = [demographic.shape[0] for item in demographic.columns]
percentages = [round((demographic[item].isna().sum() / demographic.shape[0])*100, 2) \
for item in demographic.columns]
colors = [rowSpecialColor if x == max(nullValues) else rowColor for x in nullValues]
fig = go.Figure(data=[go.Table(header=dict(values=['<b>Columns</b>','<b>NULL</b>',
'<b>All Values</b>','<b> Percentage (%)</b>'],
line_color='darkslategray', fill_color=headerColor,
align=['left', 'center'], font=dict(color='darkslategray', size=12)),
cells=dict(values=[demographic.columns, nullValues, allValues, percentages],
line_color='darkslategray',
fill_color = [colors],
align = ['left', 'center'], font=dict(color='black', size=11)))])
fig.update_layout(title='Demographic')
fig.show()
Here we will replace each NULL job industry category with the most frequent industry for that job title. For example, if 'Property' is the most common industry among customers with the job title 'Account Coordinator', every 'Account Coordinator' with a missing industry category is assigned 'Property'.
frequent_jobs = demographic.groupby(['job_title', 'job_industry_category'], as_index=False)['customer_id'].count()
freq_lookup = frequent_jobs.groupby('job_title').apply(lambda x: x['job_industry_category'][x['customer_id'].idxmax()])\
.to_frame().reset_index().rename(columns={0: 'job_industry_category'})
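A quick look at the resulting lookup table (for illustration):
# Most frequent job industry category per job title
freq_lookup.head()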
missing = demographic['job_industry_category'].isna()
demographic.loc[missing, 'job_industry_category'] = demographic.loc[missing,
'job_title'].map(freq_lookup.set_index('job_title')['job_industry_category'])
demographic.loc[demographic['job_industry_category'].isna()].shape
freq_lookup_ind = frequent_jobs.groupby('job_industry_category').apply(lambda x: x['job_title'][x['customer_id'].idxmax()])\
.to_frame().reset_index().rename(columns={0: 'job_title'})
missing = demographic['job_title'].isna()
demographic.loc[missing, 'job_title'] = demographic.loc[missing,
'job_industry_category'].map(freq_lookup_ind.set_index('job_industry_category')['job_title'])
demographic.loc[demographic['job_title'].isna()].shape
We have reduced the NULL values in job industry category and job title from 656 to 105. These remaining 105 records are missing both job title and job industry category, so we cannot make a reasonable guess about those customers' jobs and will leave them as NaN for now. Similarly, last names and birthdays cannot be guessed, so we will leave them as NaN as well.
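A quick check (optional) confirming that the remaining records are missing both fields:
# Customers missing both job_title and job_industry_category (should be the remaining 105)
demographic[demographic['job_title'].isna() & demographic['job_industry_category'].isna()].shape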
The default column appears to contain meaningless values, so we will drop it.
demographic.drop(columns='default', inplace=True)
demographic.head(5)
For the tenure column, we will compute the mean tenure for each job title, since tenure usually depends on the job, and then fill the NULL values with those means.
mean_tenure_lookup = demographic.groupby('job_title')['tenure'].mean()
missing = demographic['tenure'].isna()
demographic.loc[missing, 'tenure'] = demographic.loc[missing, 'job_title'].map(mean_tenure_lookup)
demographic.loc[demographic['tenure'].isna()].shape
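Customers with no job title still have NaN tenure after this step; if a fully filled column were required, the overall mean could serve as a fallback (a sketch, not applied here):
# Optional fallback (not applied): fill the remaining tenure NULLs with the overall mean
# demographic['tenure'] = demographic['tenure'].fillna(demographic['tenure'].mean())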
We will clean up the gender column.
demographic['gender'].value_counts()
demographic['gender'] = demographic['gender'].replace({'F': 'Female', 'M': 'Male', 'Femal': 'Female', 'U': np.nan})
demographic['gender'].value_counts()
For the birthday, we will extract the age and add it as a new column called age. We will then check the minimum and maximum age.
now = pd.Timestamp.now()
# Age in whole years (casting a timedelta to year units is deprecated in newer pandas)
demographic['age'] = np.floor((now - demographic['DOB']).dt.days / 365.25)
demographic['age'].min()
demographic['age'].max()
Since an age of 176 is clearly an outlier, we will replace it with the rounded average age of the customers.
# Replace the outlier (the maximum age, 176) with the rounded mean age
demographic.loc[demographic['age'] == demographic['age'].max(), 'age'] = round(demographic['age'].mean())
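As an optional sanity check, the age distribution after the fix:
demographic['age'].describe()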
Although there are still some NULL values, we will proceed.
demographic.info()
headerColor = '#dade9b'
rowColor = 'white'
nullValues = [address[item].isna().sum() for item in address.columns]
allValues = [address.shape[0] for item in address.columns]
percentages = [round((address[item].isna().sum() / address.shape[0])*100, 2) \
for item in address.columns]
fig = go.Figure(data=[go.Table(header=dict(values=['<b>Columns</b>','<b>NULL</b>',
'<b>All Values</b>','<b> Percentage (%)</b>'],
line_color='darkslategray', fill_color=headerColor,
align=['left', 'center'], font=dict(color='darkslategray', size=12)),
cells=dict(values=[address.columns, nullValues, allValues, percentages],
line_color='darkslategray',
fill_color = [rowColor],
align = ['left', 'center'], font=dict(color='black', size=11)))])
fig.update_layout(title='Address')
fig.show()
address.replace({'QLD': 'Queensland', 'VIC': 'Victoria', 'NSW': 'New South Wales'}, inplace=True)
address['state'].value_counts()
trans_dates = transactions.groupby('transaction_date').count()['transaction_id']\
.to_frame().reset_index().rename(columns={'transaction_id':'counts'})
trans_dates['MA20'] = trans_dates['counts'].rolling(window=20).mean()
fig = px.line(trans_dates, x='transaction_date', y='MA20', template='none',
title='Number of Transactions over Time (20-Day Moving Average)')
fig.update_yaxes(title='Counts', showgrid=False)
fig.update_xaxes(title='', showgrid=False)
fig.show()
trans_online = transactions['online_order'].value_counts().to_frame().reset_index()\
.rename(columns={'index':'Online?', 'online_order':'Counts'}).replace({1.0:'Yes', 0.0:'No'})
trans_status = transactions['order_status'].value_counts().to_frame().reset_index()\
.rename(columns={'index':'Status', 'order_status':'Counts'})
fig = make_subplots(rows=1, cols=2, shared_yaxes=True,
subplot_titles=('Online Order?', 'Status'))
fig.add_trace(go.Bar(x=trans_online['Online?'] ,y=trans_online['Counts'], name=''),
row=1, col=1)
fig.add_trace(go.Bar(x=trans_status['Status'], y=trans_status['Counts'], name=''),
row=1, col=2)
fig.update_layout(template='none')
fig.update_yaxes(showgrid=False)
fig.show()
most_pop_brand = transactions['brand'].value_counts().to_frame().reset_index()\
.rename(columns={'index':'Brand', 'brand':'Counts'})
fig = px.pie(most_pop_brand, values='Counts', names='Brand',
color_discrete_sequence=px.colors.sequential.Purp,
title='The Most Popular Brand')
fig.show()
fig = go.Figure()
fig.add_trace(go.Box(y=transactions['list_price'], notched=True, name='List Price'))
fig.add_trace(go.Box(y=transactions['standard_cost'], notched=True, name='Standard Cost'))
fig.update_layout(template='none', title='List Price and Standard Cost')
fig.show()
fig = px.histogram(demographic, x="gender", opacity=0.5,
color_discrete_sequence=['indianred'],
template='none', title='Gender of the Customers')
fig.update_yaxes(range=[0, 2300])
fig.show()
fig = px.histogram(demographic, x='age', opacity=0.5,
color_discrete_sequence=['indianred'],
template='none', title='Age of the Customers')
fig.show()
job_ind_cat = demographic['job_industry_category'].value_counts().to_frame()\
.reset_index().rename(columns={'index':'Job Industry Category', 'job_industry_category':'Counts'})
fig = px.pie(job_ind_cat, values='Counts', names='Job Industry Category',
color_discrete_sequence=px.colors.sequential.Purp,
title='Job Industry')
fig.show()
wealth = demographic['wealth_segment'].value_counts().to_frame()\
.reset_index().rename(columns={'index':'Wealth Segment', 'wealth_segment':'Counts'})
car = demographic['owns_car'].value_counts().to_frame()\
.reset_index().rename(columns={'index':'Car Ownership', 'owns_car':'Counts'})
car
colors = ['darkorange', 'mediumturquoise']
fig = make_subplots(rows=1, cols=2, specs=[[{"type": "xy"}, {"type": "domain"}]],
subplot_titles=('Wealth Segment', 'Car Ownership'))
fig.add_trace(go.Bar(x=wealth['Wealth Segment'], y=wealth['Counts'],
name='Wealth Segment', marker_color='indianred'), row=1, col=1)
fig.add_trace(go.Pie(labels=car['Car Ownership'],values=car['Counts'],
name='Car Ownership', marker=dict(colors=colors, line=dict(color='#000000', width=2))),
row=1, col=2)
fig.update_layout(template='none', showlegend=False)
fig.show()
fig = go.Figure(data=go.Violin(y=demographic['tenure'], box_visible=True, line_color='black',
meanline_visible=True, fillcolor='lightseagreen', opacity=0.6,
x0='Tenure'))
fig.update_layout(yaxis_zeroline=False, template='none', title='Tenure')
fig.update_yaxes(showgrid=False)
fig.show()
postCode = pd.read_csv('AUS_postcode.csv')
postCode.drop(columns=['suburb', 'state', 'dc', 'type'], inplace=True)
postCode = postCode.drop_duplicates(subset='postcode')
address = address.merge(postCode, on='postcode', how='left')
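A quick check (optional): addresses whose postcode did not match the lookup are left with missing coordinates by the left join.
# Number of addresses with no matching postcode (lat/lon are NaN after the left merge)
address['lat'].isna().sum()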
u_states = np.unique(address['state'].values)
d = dict(zip(u_states, np.arange(len(u_states))))
fig = go.Figure(data=go.Scattergeo(lon=address['lon'], lat=address['lat'],
mode='markers', marker_color=[d[s] for s in address['state']]))
fig.update_layout(title='The Customer Locations',
geo=dict(projection_scale=5,
center=dict(lat=-28.892778, lon=153.2268)))
fig.show()
transactions.head(5)
demographic.head(5)
address.head(5)
Overall, the datasets are not complete. The main problems include: