Link to datasets: https://www.kaggle.com/olistbr/brazilian-ecommerce
import numpy as np
import pandas as pd
import os
import matplotlib.pyplot as plt
import missingno as msno
import param
import panel as pn
import seaborn as sns
pn.extension()
orders = pd.read_csv("olist_orders_dataset.csv")
orders.head(3)
| order_id | customer_id | order_status | order_purchase_timestamp | order_approved_at | order_delivered_carrier_date | order_delivered_customer_date | order_estimated_delivery_date | |
|---|---|---|---|---|---|---|---|---|
| 0 | e481f51cbdc54678b7cc49136f2d6af7 | 9ef432eb6251297304e76186b10a928d | delivered | 2017-10-02 10:56:33 | 2017-10-02 11:07:15 | 2017-10-04 19:55:00 | 2017-10-10 21:25:13 | 2017-10-18 00:00:00 |
| 1 | 53cdb2fc8bc7dce0b6741e2150273451 | b0830fb4747a6c6d20dea0b8c802d7ef | delivered | 2018-07-24 20:41:37 | 2018-07-26 03:24:27 | 2018-07-26 14:31:00 | 2018-08-07 15:27:45 | 2018-08-13 00:00:00 |
| 2 | 47770eb9100c2d0c44946d9cf07ec65d | 41ce2a54c0b03bf3443c3d931a367089 | delivered | 2018-08-08 08:38:49 | 2018-08-08 08:55:23 | 2018-08-08 13:50:00 | 2018-08-17 18:06:29 | 2018-09-04 00:00:00 |
Looking into delivery
orders['order_purchase_timestamp'] = pd.to_datetime(orders.order_purchase_timestamp)
orders['order_approved_at'] = pd.to_datetime(orders.order_approved_at)
orders['order_delivered_carrier_date'] = pd.to_datetime(orders.order_delivered_carrier_date)
orders['order_delivered_customer_date'] = pd.to_datetime(orders.order_delivered_customer_date)
orders['order_estimated_delivery_date'] = pd.to_datetime(orders.order_estimated_delivery_date)
orders.head(3)
| order_id | customer_id | order_status | order_purchase_timestamp | order_approved_at | order_delivered_carrier_date | order_delivered_customer_date | order_estimated_delivery_date | |
|---|---|---|---|---|---|---|---|---|
| 0 | e481f51cbdc54678b7cc49136f2d6af7 | 9ef432eb6251297304e76186b10a928d | delivered | 2017-10-02 10:56:33 | 2017-10-02 11:07:15 | 2017-10-04 19:55:00 | 2017-10-10 21:25:13 | 2017-10-18 |
| 1 | 53cdb2fc8bc7dce0b6741e2150273451 | b0830fb4747a6c6d20dea0b8c802d7ef | delivered | 2018-07-24 20:41:37 | 2018-07-26 03:24:27 | 2018-07-26 14:31:00 | 2018-08-07 15:27:45 | 2018-08-13 |
| 2 | 47770eb9100c2d0c44946d9cf07ec65d | 41ce2a54c0b03bf3443c3d931a367089 | delivered | 2018-08-08 08:38:49 | 2018-08-08 08:55:23 | 2018-08-08 13:50:00 | 2018-08-17 18:06:29 | 2018-09-04 |
msno.bar(orders)
<AxesSubplot:>
Focus on delivered orders only
orders['order_status'].unique()
array(['delivered', 'invoiced', 'shipped', 'processing', 'unavailable',
'canceled', 'created', 'approved'], dtype=object)
delivered = orders[orders['order_status'] == 'delivered']
delivered = delivered.drop(columns="order_status")
delivered.head(3)
| order_id | customer_id | order_purchase_timestamp | order_approved_at | order_delivered_carrier_date | order_delivered_customer_date | order_estimated_delivery_date | |
|---|---|---|---|---|---|---|---|
| 0 | e481f51cbdc54678b7cc49136f2d6af7 | 9ef432eb6251297304e76186b10a928d | 2017-10-02 10:56:33 | 2017-10-02 11:07:15 | 2017-10-04 19:55:00 | 2017-10-10 21:25:13 | 2017-10-18 |
| 1 | 53cdb2fc8bc7dce0b6741e2150273451 | b0830fb4747a6c6d20dea0b8c802d7ef | 2018-07-24 20:41:37 | 2018-07-26 03:24:27 | 2018-07-26 14:31:00 | 2018-08-07 15:27:45 | 2018-08-13 |
| 2 | 47770eb9100c2d0c44946d9cf07ec65d | 41ce2a54c0b03bf3443c3d931a367089 | 2018-08-08 08:38:49 | 2018-08-08 08:55:23 | 2018-08-08 13:50:00 | 2018-08-17 18:06:29 | 2018-09-04 |
msno.bar(delivered)
<AxesSubplot:>
delivered = delivered.dropna()
msno.bar(delivered)
<AxesSubplot:>
Focus on 2017 and 2018 only, since very little data from 2016.
delivered['order_purchase_timestamp'].dt.year.value_counts().plot.bar()
delivered = orders[orders['order_purchase_timestamp'].dt.year > 2016]
Overall sales increased in 2018.
class DeliveredDashboard(param.Parameterized):
year = param.ObjectSelector(default=2017, objects=[2017, 2018])
plt.figure(figsize=(12,8), dpi= 100)
def sales_plot(self):
plt.clf()
delivered[delivered['order_purchase_timestamp'].dt.year==self.year]['order_purchase_timestamp'].dt.month.value_counts().plot.bar()
return plt.gcf()
def total_plot(self):
plt.clf()
delivered['order_purchase_timestamp'].dt.year.value_counts().plot.bar()
return plt.gcf()
def append_end(self):
plt.clf()
return ""
dd1 = DeliveredDashboard(name='')
dashboard = pn.Column('Sales Dashboard',
dd1.param,
pn.Tabs(
('Sales by year', dd1.sales_plot),
('Total Sales', dd1.total_plot),
('',dd1.append_end)
)
)
dashboard.embed()
<Figure size 1728x1152 with 0 Axes>
Counting delivery speed and delay
delivered['order_delay'] = (delivered['order_delivered_customer_date'] - delivered['order_estimated_delivery_date']).dt.components['days']
delivered['order_duration'] = (delivered['order_delivered_customer_date'] - delivered['order_purchase_timestamp']).dt.components['days']
delivered['year'] = np.where((delivered.order_purchase_timestamp.dt.year == 2017), 2017, 2018)
delivered.head(3)
<ipython-input-10-6e47f4c74239>:1: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy delivered['order_delay'] = (delivered['order_delivered_customer_date'] - delivered['order_estimated_delivery_date']).dt.components['days'] <ipython-input-10-6e47f4c74239>:2: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy delivered['order_duration'] = (delivered['order_delivered_customer_date'] - delivered['order_purchase_timestamp']).dt.components['days'] <ipython-input-10-6e47f4c74239>:3: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy delivered['year'] = np.where((delivered.order_purchase_timestamp.dt.year == 2017), 2017, 2018)
| order_id | customer_id | order_status | order_purchase_timestamp | order_approved_at | order_delivered_carrier_date | order_delivered_customer_date | order_estimated_delivery_date | order_delay | order_duration | year | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | e481f51cbdc54678b7cc49136f2d6af7 | 9ef432eb6251297304e76186b10a928d | delivered | 2017-10-02 10:56:33 | 2017-10-02 11:07:15 | 2017-10-04 19:55:00 | 2017-10-10 21:25:13 | 2017-10-18 | -8.0 | 8.0 | 2017 |
| 1 | 53cdb2fc8bc7dce0b6741e2150273451 | b0830fb4747a6c6d20dea0b8c802d7ef | delivered | 2018-07-24 20:41:37 | 2018-07-26 03:24:27 | 2018-07-26 14:31:00 | 2018-08-07 15:27:45 | 2018-08-13 | -6.0 | 13.0 | 2018 |
| 2 | 47770eb9100c2d0c44946d9cf07ec65d | 41ce2a54c0b03bf3443c3d931a367089 | delivered | 2018-08-08 08:38:49 | 2018-08-08 08:55:23 | 2018-08-08 13:50:00 | 2018-08-17 18:06:29 | 2018-09-04 | -18.0 | 9.0 | 2018 |
delivered_stats = ['order_duration', 'order_delay']
def remove_outlier_iqr(df, col):
IQR = df[col].quantile(0.25) - df[col].quantile(0.75)
new_df = df.loc[(df[col] > (df[col].quantile(0.25) + 1.5 * IQR)) & (df[col] < (df[col].quantile(0.75) - 1.5 * IQR))]
return(new_df)
class DeliveredDashboard2(param.Parameterized):
stats = param.ObjectSelector(default='order_duration', objects=delivered_stats)
plt.figure(figsize=(12,8), dpi= 100)
def original_boxplot(self):
plt.clf()
sns.boxplot(x=delivered[self.stats])
return plt.gcf()
def after_boxplot(self):
plt.clf()
order_no_outlier = remove_outlier_iqr(delivered, 'order_duration')
sns.boxplot(x=order_no_outlier[self.stats])
return plt.gcf()
def append_end(self):
plt.clf()
return ""
dd2 = DeliveredDashboard2(name='')
dashboard = pn.Column('Outlier Dashboard',
dd2.param,
pn.Tabs(
('Original', dd2.original_boxplot),
('Outlier removed', dd2.after_boxplot),
('',dd2.append_end)
)
)
dashboard.embed()
<Figure size 1728x1152 with 0 Axes>
class DeliveredDashboard3(param.Parameterized):
stats = param.ObjectSelector(default='order_duration', objects=delivered_stats)
plt.figure(figsize=(12,8), dpi= 100)
def after_boxplot(self, year):
plt.clf()
order_no_outlier = remove_outlier_iqr(delivered[delivered['year']==year], self.stats)
sns.boxplot(x=order_no_outlier[self.stats])
return plt.gcf()
def boxplot_2017(self):
return self.after_boxplot(2017)
def boxplot_2018(self):
return self.after_boxplot(2018)
def append_end(self):
plt.clf()
return ""
dd3 = DeliveredDashboard3(name='')
dashboard = pn.Column('Delivered Outlier Dashboard',
dd3.param,
pn.Tabs(
('2017', dd3.boxplot_2017),
('2018', dd3.boxplot_2018),
('',dd3.append_end)
)
)
dashboard.embed()
<Figure size 1728x1152 with 0 Axes>
Problem identified: On average, delivery duration has decreased in 2018 and the delay is more than -10 days. At an average of 9 days of delivery, it can still be improved.
customer_visit_count = orders.groupby(['customer_id']).size().reset_index(name='count')
customer_visit_count.head(3)
| customer_id | count | |
|---|---|---|
| 0 | 00012a2ce6f8dcda20d059ce98491703 | 1 |
| 1 | 000161a058600d5901f007fab4c27140 | 1 |
| 2 | 0001fd6190edaaf884bcaf3d49edf079 | 1 |
Risk: None of the customers were returning customers.
plt.figure(figsize=(12,8), dpi= 100)
sns.boxplot(x=customer_visit_count['count'])
<AxesSubplot:xlabel='count'>
order_items = pd.read_csv("olist_order_items_dataset.csv")
order_items.head(3)
| order_id | order_item_id | product_id | seller_id | shipping_limit_date | price | freight_value | |
|---|---|---|---|---|---|---|---|
| 0 | 00010242fe8c5a6d1ba2dd792cb16214 | 1 | 4244733e06e7ecb4970a6e2683c13e61 | 48436dade18ac8b2bce089ec2a041202 | 2017-09-19 09:45:35 | 58.9 | 13.29 |
| 1 | 00018f77f2f0320c557190d7a144bdd3 | 1 | e5f2d52b802189ee658865ca93d83a8f | dd7ddc04e1b6c2c614352b383efe2d36 | 2017-05-03 11:05:13 | 239.9 | 19.93 |
| 2 | 000229ec398224ef6ca0657da4fc703e | 1 | c777355d18b72b67abbeef9df44fd0fd | 5b51032eddd242adc84c38acab88f23d | 2018-01-18 14:48:30 | 199.0 | 17.87 |
msno.bar(order_items)
<AxesSubplot:>
order_price = order_items.groupby('order_id').sum('price')
order_price.head(3)
| order_item_id | price | freight_value | |
|---|---|---|---|
| order_id | |||
| 00010242fe8c5a6d1ba2dd792cb16214 | 1 | 58.9 | 13.29 |
| 00018f77f2f0320c557190d7a144bdd3 | 1 | 239.9 | 19.93 |
| 000229ec398224ef6ca0657da4fc703e | 1 | 199.0 | 17.87 |
class OrderPriceDashboard(param.Parameterized):
stats = param.ObjectSelector(default='Original', objects=['Original', 'Outlier removed'])
plt.figure(figsize=(12,8), dpi= 100)
def after_boxplot(self):
plt.clf()
if self.stats == 'Original':
sns.boxplot(x=order_price['price'])
else:
order_no_outlier = remove_outlier_iqr(order_price, 'price')
sns.boxplot(x=order_no_outlier['price'])
return plt.gcf()
def append_end(self):
plt.clf()
return ""
op1 = OrderPriceDashboard(name='')
dashboard = pn.Column('Order Price Outlier Dashboard',
op1.param,
op1.after_boxplot,
op1.append_end
)
dashboard.embed()
<Figure size 1728x1152 with 0 Axes>
Because all the customers have only one order and are not returning customers, revenue generated by each customer was mostly only around 50 to 150.
order_reviews = pd.read_csv("olist_order_reviews_dataset.csv")
order_reviews.head(3)
| review_id | order_id | review_score | review_comment_title | review_comment_message | review_creation_date | review_answer_timestamp | |
|---|---|---|---|---|---|---|---|
| 0 | 7bc2406110b926393aa56f80a40eba40 | 73fc7af87114b39712e6da79b0a377eb | 4 | NaN | NaN | 2018-01-18 00:00:00 | 2018-01-18 21:46:59 |
| 1 | 80e641a11e56f04c1ad469d5645fdfde | a548910a1c6147796b98fdf73dbeba33 | 5 | NaN | NaN | 2018-03-10 00:00:00 | 2018-03-11 03:05:13 |
| 2 | 228ce5500dc1d8e020d8d1322874b6f0 | f9e4b658b201a9f2ecdecbb34bed034b | 5 | NaN | NaN | 2018-02-17 00:00:00 | 2018-02-18 14:36:24 |
msno.bar(order_reviews)
<AxesSubplot:>
I have used Google Translate to translate the reviews from Portugese to English, then saved the output in a local html file. So, I am scraping from the html file.
import codecs
from bs4 import BeautifulSoup
def soup_table(file):
f=codecs.open(file, 'r')
html = BeautifulSoup(f, 'html.parser')
html = html.find_all('td', {'align':'left'})
txt_list = []
for tr in html:
txt_list.append(tr.text)
return(txt_list)
english_title = soup_table('english_title.htm')
english_title.pop(0)
len(english_title)
11715
Based on the number, I can assume that all comments were translated and successfully scraped.
english_comment = soup_table('english_comment.htm')
english_comment.pop(0)
len(english_comment)
41753
Because Google Translate only accepts .xlsx, I have to read the excel back into a dataframe to map the translation back to its review id.
from pandas import read_excel
title_xlsx = read_excel('english_title.xlsx')
title_xlsx.head(3)
| Unnamed: 0 | review_id | review_comment_title | |
|---|---|---|---|
| 0 | 9 | 8670d52e15e00043ae7de4c01cc2fe06 | recomendo |
| 1 | 15 | 3948b09f7c818e2d86c9a546758b2335 | super recomendo |
| 2 | 19 | 373cbeecea8286a2b66c97b1b157ec46 | negação chegou produto |
title_xlsx = title_xlsx.drop(title_xlsx.columns[[0, 2]], axis=1)
title_xlsx['translated_title'] = english_title
title_xlsx.head(3)
| review_id | translated_title | |
|---|---|---|
| 0 | 8670d52e15e00043ae7de4c01cc2fe06 | I recommend |
| 1 | 3948b09f7c818e2d86c9a546758b2335 | super recommend |
| 2 | 373cbeecea8286a2b66c97b1b157ec46 | denial arrived product |
comment_xlsx = read_excel('english_comment.xlsx')
comment_xlsx.head(3)
| Unnamed: 0 | review_id | review_comment_message | |
|---|---|---|---|
| 0 | 3 | e64fb393e7b32834bb789ff8bb30750e | recebi bem antes prazo estipulado |
| 1 | 4 | f7c4243c7fe1938f181bec41a392bdeb | parabéns lojas lannister adorei comprar intern... |
| 2 | 9 | 8670d52e15e00043ae7de4c01cc2fe06 | aparelho eficiente site marca aparelho impress... |
comment_xlsx = comment_xlsx.drop(comment_xlsx.columns[[0, 2]], axis=1)
comment_xlsx['translated_comment'] = english_comment
comment_xlsx.head(3)
| review_id | translated_comment | |
|---|---|---|
| 0 | e64fb393e7b32834bb789ff8bb30750e | I received it well before the deadline |
| 1 | f7c4243c7fe1938f181bec41a392bdeb | congratulations lannister stores loved buying ... |
| 2 | 8670d52e15e00043ae7de4c01cc2fe06 | efficient device website brand device printed ... |
translated_order_reviews = pd.merge(order_reviews, title_xlsx, left_on='review_id', right_on='review_id', how='left')
translated_order_reviews = pd.merge(translated_order_reviews, comment_xlsx, left_on='review_id', right_on='review_id', how='left')
translated_order_reviews.head(10)
| review_id | order_id | review_score | review_comment_title | review_comment_message | review_creation_date | review_answer_timestamp | translated_title | translated_comment | |
|---|---|---|---|---|---|---|---|---|---|
| 0 | 7bc2406110b926393aa56f80a40eba40 | 73fc7af87114b39712e6da79b0a377eb | 4 | NaN | NaN | 2018-01-18 00:00:00 | 2018-01-18 21:46:59 | NaN | NaN |
| 1 | 80e641a11e56f04c1ad469d5645fdfde | a548910a1c6147796b98fdf73dbeba33 | 5 | NaN | NaN | 2018-03-10 00:00:00 | 2018-03-11 03:05:13 | NaN | NaN |
| 2 | 228ce5500dc1d8e020d8d1322874b6f0 | f9e4b658b201a9f2ecdecbb34bed034b | 5 | NaN | NaN | 2018-02-17 00:00:00 | 2018-02-18 14:36:24 | NaN | NaN |
| 3 | e64fb393e7b32834bb789ff8bb30750e | 658677c97b385a9be170737859d3511b | 5 | NaN | Recebi bem antes do prazo estipulado. | 2017-04-21 00:00:00 | 2017-04-21 22:02:06 | NaN | I received it well before the deadline |
| 4 | f7c4243c7fe1938f181bec41a392bdeb | 8e6bfb81e283fa7e4f11123a3fb894f1 | 5 | NaN | Parabéns lojas lannister adorei comprar pela I... | 2018-03-01 00:00:00 | 2018-03-02 10:26:53 | NaN | congratulations lannister stores loved buying ... |
| 5 | 15197aa66ff4d0650b5434f1b46cda19 | b18dcdf73be66366873cd26c5724d1dc | 1 | NaN | NaN | 2018-04-13 00:00:00 | 2018-04-16 00:39:37 | NaN | NaN |
| 6 | 07f9bee5d1b850860defd761afa7ff16 | e48aa0d2dcec3a2e87348811bcfdf22b | 5 | NaN | NaN | 2017-07-16 00:00:00 | 2017-07-18 19:30:34 | NaN | NaN |
| 7 | 7c6400515c67679fbee952a7525281ef | c31a859e34e3adac22f376954e19b39d | 5 | NaN | NaN | 2018-08-14 00:00:00 | 2018-08-14 21:36:06 | NaN | NaN |
| 8 | a3f6f7f6f433de0aefbb97da197c554c | 9c214ac970e84273583ab523dfafd09b | 5 | NaN | NaN | 2017-05-17 00:00:00 | 2017-05-18 12:05:37 | NaN | NaN |
| 9 | 8670d52e15e00043ae7de4c01cc2fe06 | b9bf720beb4ab3728760088589c62129 | 4 | recomendo | aparelho eficiente. no site a marca do aparelh... | 2018-05-22 00:00:00 | 2018-05-23 16:45:47 | I recommend | efficient device website brand device printed ... |
msno.bar(translated_order_reviews)
<AxesSubplot:>
delivery_pattern = 'early|late|deadline|stipulate|delay|deliver|fast|slow|schedule|ahead|arrive|receive|time|return|cancel|track|trip|came|speed|wait|courier|transit|month|address|day|damage|ship|condition|mail|week|package|condition|date|freight|trace'
translated_order_reviews['delivery_related'] = ((translated_order_reviews['translated_comment'].str.contains(delivery_pattern)) | (translated_order_reviews['translated_title'].str.contains(delivery_pattern)))
translated_order_reviews.loc[pd.isnull(translated_order_reviews['translated_comment']), 'delivery_related'] = 'Null'
translated_order_reviews.head(10)
| review_id | order_id | review_score | review_comment_title | review_comment_message | review_creation_date | review_answer_timestamp | translated_title | translated_comment | delivery_related | |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 7bc2406110b926393aa56f80a40eba40 | 73fc7af87114b39712e6da79b0a377eb | 4 | NaN | NaN | 2018-01-18 00:00:00 | 2018-01-18 21:46:59 | NaN | NaN | Null |
| 1 | 80e641a11e56f04c1ad469d5645fdfde | a548910a1c6147796b98fdf73dbeba33 | 5 | NaN | NaN | 2018-03-10 00:00:00 | 2018-03-11 03:05:13 | NaN | NaN | Null |
| 2 | 228ce5500dc1d8e020d8d1322874b6f0 | f9e4b658b201a9f2ecdecbb34bed034b | 5 | NaN | NaN | 2018-02-17 00:00:00 | 2018-02-18 14:36:24 | NaN | NaN | Null |
| 3 | e64fb393e7b32834bb789ff8bb30750e | 658677c97b385a9be170737859d3511b | 5 | NaN | Recebi bem antes do prazo estipulado. | 2017-04-21 00:00:00 | 2017-04-21 22:02:06 | NaN | I received it well before the deadline | True |
| 4 | f7c4243c7fe1938f181bec41a392bdeb | 8e6bfb81e283fa7e4f11123a3fb894f1 | 5 | NaN | Parabéns lojas lannister adorei comprar pela I... | 2018-03-01 00:00:00 | 2018-03-02 10:26:53 | NaN | congratulations lannister stores loved buying ... | False |
| 5 | 15197aa66ff4d0650b5434f1b46cda19 | b18dcdf73be66366873cd26c5724d1dc | 1 | NaN | NaN | 2018-04-13 00:00:00 | 2018-04-16 00:39:37 | NaN | NaN | Null |
| 6 | 07f9bee5d1b850860defd761afa7ff16 | e48aa0d2dcec3a2e87348811bcfdf22b | 5 | NaN | NaN | 2017-07-16 00:00:00 | 2017-07-18 19:30:34 | NaN | NaN | Null |
| 7 | 7c6400515c67679fbee952a7525281ef | c31a859e34e3adac22f376954e19b39d | 5 | NaN | NaN | 2018-08-14 00:00:00 | 2018-08-14 21:36:06 | NaN | NaN | Null |
| 8 | a3f6f7f6f433de0aefbb97da197c554c | 9c214ac970e84273583ab523dfafd09b | 5 | NaN | NaN | 2017-05-17 00:00:00 | 2017-05-18 12:05:37 | NaN | NaN | Null |
| 9 | 8670d52e15e00043ae7de4c01cc2fe06 | b9bf720beb4ab3728760088589c62129 | 4 | recomendo | aparelho eficiente. no site a marca do aparelh... | 2018-05-22 00:00:00 | 2018-05-23 16:45:47 | I recommend | efficient device website brand device printed ... | True |
translated_order_reviews = pd.merge(translated_order_reviews, orders, left_on='order_id', right_on='order_id', how='left')
translated_order_reviews['year'] = np.where((translated_order_reviews.order_purchase_timestamp.dt.year == 2017), 2017, 2018)
translated_order_reviews.head(3)
| review_id | order_id | review_score | review_comment_title | review_comment_message | review_creation_date | review_answer_timestamp | translated_title | translated_comment | delivery_related | customer_id | order_status | order_purchase_timestamp | order_approved_at | order_delivered_carrier_date | order_delivered_customer_date | order_estimated_delivery_date | year | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 7bc2406110b926393aa56f80a40eba40 | 73fc7af87114b39712e6da79b0a377eb | 4 | NaN | NaN | 2018-01-18 00:00:00 | 2018-01-18 21:46:59 | NaN | NaN | Null | 41dcb106f807e993532d446263290104 | delivered | 2018-01-11 15:30:49 | 2018-01-11 15:47:59 | 2018-01-12 21:57:22 | 2018-01-17 18:42:41 | 2018-02-02 | 2018 |
| 1 | 80e641a11e56f04c1ad469d5645fdfde | a548910a1c6147796b98fdf73dbeba33 | 5 | NaN | NaN | 2018-03-10 00:00:00 | 2018-03-11 03:05:13 | NaN | NaN | Null | 8a2e7ef9053dea531e4dc76bd6d853e6 | delivered | 2018-02-28 12:25:19 | 2018-02-28 12:48:39 | 2018-03-02 19:08:15 | 2018-03-09 23:17:20 | 2018-03-14 | 2018 |
| 2 | 228ce5500dc1d8e020d8d1322874b6f0 | f9e4b658b201a9f2ecdecbb34bed034b | 5 | NaN | NaN | 2018-02-17 00:00:00 | 2018-02-18 14:36:24 | NaN | NaN | Null | e226dfed6544df5b7b87a48208690feb | delivered | 2018-02-03 09:56:22 | 2018-02-03 10:33:41 | 2018-02-06 16:18:28 | 2018-02-16 17:28:48 | 2018-03-09 | 2018 |
translated_order_reviews['order_duration'] = (translated_order_reviews['order_delivered_customer_date'] - translated_order_reviews['order_purchase_timestamp']).dt.components['days']
translated_order_reviews.head(3)
| review_id | order_id | review_score | review_comment_title | review_comment_message | review_creation_date | review_answer_timestamp | translated_title | translated_comment | delivery_related | customer_id | order_status | order_purchase_timestamp | order_approved_at | order_delivered_carrier_date | order_delivered_customer_date | order_estimated_delivery_date | year | order_duration | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 7bc2406110b926393aa56f80a40eba40 | 73fc7af87114b39712e6da79b0a377eb | 4 | NaN | NaN | 2018-01-18 00:00:00 | 2018-01-18 21:46:59 | NaN | NaN | Null | 41dcb106f807e993532d446263290104 | delivered | 2018-01-11 15:30:49 | 2018-01-11 15:47:59 | 2018-01-12 21:57:22 | 2018-01-17 18:42:41 | 2018-02-02 | 2018 | 6.0 |
| 1 | 80e641a11e56f04c1ad469d5645fdfde | a548910a1c6147796b98fdf73dbeba33 | 5 | NaN | NaN | 2018-03-10 00:00:00 | 2018-03-11 03:05:13 | NaN | NaN | Null | 8a2e7ef9053dea531e4dc76bd6d853e6 | delivered | 2018-02-28 12:25:19 | 2018-02-28 12:48:39 | 2018-03-02 19:08:15 | 2018-03-09 23:17:20 | 2018-03-14 | 2018 | 9.0 |
| 2 | 228ce5500dc1d8e020d8d1322874b6f0 | f9e4b658b201a9f2ecdecbb34bed034b | 5 | NaN | NaN | 2018-02-17 00:00:00 | 2018-02-18 14:36:24 | NaN | NaN | Null | e226dfed6544df5b7b87a48208690feb | delivered | 2018-02-03 09:56:22 | 2018-02-03 10:33:41 | 2018-02-06 16:18:28 | 2018-02-16 17:28:48 | 2018-03-09 | 2018 | 13.0 |
class ReviewDashboard(param.Parameterized):
stats = param.ObjectSelector(default='review_score', objects=['review_score', 'order_duration'])
sns.set(style="darkgrid")
plt.figure(figsize=(12,8), dpi= 100)
def sns_plot(self):
plt.clf()
splot = sns.barplot(x="year", y=self.stats, hue="delivery_related", data=translated_order_reviews, ci=None)
for p in splot.patches:
splot.annotate(format(p.get_height(), '.1f'),
(p.get_x() + p.get_width() / 2., p.get_height()),
ha = 'center', va = 'center',
xytext = (0, 9),
textcoords = 'offset points')
return plt.gcf()
def append_end(self):
plt.clf()
return ""
rd1 = ReviewDashboard(name='')
dashboard = pn.Column('Review Dashboard',
rd1.param,
rd1.sns_plot,
rd1.append_end
)
dashboard.embed()
<Figure size 1728x1152 with 0 Axes>
While the average review score is above 3, it tends to be lower for reviews that are related to delivery. Just like there is some improvement in delivery speed, there is slight improvement in this case too.
Average rating for reviews that mention about delivery has increased.
Average days taken to deliver has decreased for reviews that mentioned about delivery.
delivered['approval_hours'] = (delivered['order_approved_at'] - delivered['order_purchase_timestamp']).dt.components['hours']
delivered.head(3)
<ipython-input-33-838e32387169>:1: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy delivered['approval_hours'] = (delivered['order_approved_at'] - delivered['order_purchase_timestamp']).dt.components['hours']
| order_id | customer_id | order_status | order_purchase_timestamp | order_approved_at | order_delivered_carrier_date | order_delivered_customer_date | order_estimated_delivery_date | order_delay | order_duration | year | approval_hours | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | e481f51cbdc54678b7cc49136f2d6af7 | 9ef432eb6251297304e76186b10a928d | delivered | 2017-10-02 10:56:33 | 2017-10-02 11:07:15 | 2017-10-04 19:55:00 | 2017-10-10 21:25:13 | 2017-10-18 | -8.0 | 8.0 | 2017 | 0.0 |
| 1 | 53cdb2fc8bc7dce0b6741e2150273451 | b0830fb4747a6c6d20dea0b8c802d7ef | delivered | 2018-07-24 20:41:37 | 2018-07-26 03:24:27 | 2018-07-26 14:31:00 | 2018-08-07 15:27:45 | 2018-08-13 | -6.0 | 13.0 | 2018 | 6.0 |
| 2 | 47770eb9100c2d0c44946d9cf07ec65d | 41ce2a54c0b03bf3443c3d931a367089 | delivered | 2018-08-08 08:38:49 | 2018-08-08 08:55:23 | 2018-08-08 13:50:00 | 2018-08-17 18:06:29 | 2018-09-04 | -18.0 | 9.0 | 2018 | 0.0 |
orders_approval_hours = remove_outlier_iqr(delivered, 'approval_hours')
orders_approval_hours.head()
| order_id | customer_id | order_status | order_purchase_timestamp | order_approved_at | order_delivered_carrier_date | order_delivered_customer_date | order_estimated_delivery_date | order_delay | order_duration | year | approval_hours | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | e481f51cbdc54678b7cc49136f2d6af7 | 9ef432eb6251297304e76186b10a928d | delivered | 2017-10-02 10:56:33 | 2017-10-02 11:07:15 | 2017-10-04 19:55:00 | 2017-10-10 21:25:13 | 2017-10-18 | -8.0 | 8.0 | 2017 | 0.0 |
| 1 | 53cdb2fc8bc7dce0b6741e2150273451 | b0830fb4747a6c6d20dea0b8c802d7ef | delivered | 2018-07-24 20:41:37 | 2018-07-26 03:24:27 | 2018-07-26 14:31:00 | 2018-08-07 15:27:45 | 2018-08-13 | -6.0 | 13.0 | 2018 | 6.0 |
| 2 | 47770eb9100c2d0c44946d9cf07ec65d | 41ce2a54c0b03bf3443c3d931a367089 | delivered | 2018-08-08 08:38:49 | 2018-08-08 08:55:23 | 2018-08-08 13:50:00 | 2018-08-17 18:06:29 | 2018-09-04 | -18.0 | 9.0 | 2018 | 0.0 |
| 3 | 949d5b44dbf5de918fe9c16f97b45f8a | f88197465ea7920adcdbec7375364d82 | delivered | 2017-11-18 19:28:06 | 2017-11-18 19:45:59 | 2017-11-22 13:39:59 | 2017-12-02 00:28:42 | 2017-12-15 | -13.0 | 13.0 | 2017 | 0.0 |
| 4 | ad21c59c0840e6cb83a9ceb5573f8159 | 8ab97904e6daea8866dbdbc4fb7aad2c | delivered | 2018-02-13 21:18:39 | 2018-02-13 22:20:29 | 2018-02-14 19:46:34 | 2018-02-16 18:17:02 | 2018-02-26 | -10.0 | 2.0 | 2018 | 1.0 |
class DeliverHoursDashboard(param.Parameterized):
stats = param.ObjectSelector(default='Original', objects=['Original', 'Outlier removed'])
plt.figure(figsize=(12,8), dpi= 100)
def after_boxplot(self):
plt.clf()
if self.stats == 'Original':
sns.boxplot(x=delivered['approval_hours'])
else:
sns.boxplot(x=orders_approval_hours['approval_hours'])
return plt.gcf()
def append_end(self):
plt.clf()
return ""
dh1 = DeliverHoursDashboard(name='')
dashboard = pn.Column('Delivery Hours Outlier Dashboard',
dh1.param,
dh1.after_boxplot,
dh1.append_end
)
dashboard.embed()
<Figure size 1728x1152 with 0 Axes>
It usually takes around 45 minutes for an order to be approved. This duration has increased in 2018.
order_payments = pd.read_csv("olist_order_payments_dataset.csv")
order_payments.head(3)
| order_id | payment_sequential | payment_type | payment_installments | payment_value | |
|---|---|---|---|---|---|
| 0 | b81ef226f3fe1789b1e8b2acac839d17 | 1 | credit_card | 8 | 99.33 |
| 1 | a9810da82917af2d9aefd1278f1dcfa0 | 1 | credit_card | 1 | 24.39 |
| 2 | 25e8ea4e93396b6fa0d3dd708e76c1bd | 1 | credit_card | 1 | 65.71 |
orders_approval_hours = orders_approval_hours.merge(order_payments, left_on='order_id', right_on='order_id')
orders_approval_hours.head(3)
| order_id | customer_id | order_status | order_purchase_timestamp | order_approved_at | order_delivered_carrier_date | order_delivered_customer_date | order_estimated_delivery_date | order_delay | order_duration | year | approval_hours | payment_sequential | payment_type | payment_installments | payment_value | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | e481f51cbdc54678b7cc49136f2d6af7 | 9ef432eb6251297304e76186b10a928d | delivered | 2017-10-02 10:56:33 | 2017-10-02 11:07:15 | 2017-10-04 19:55:00 | 2017-10-10 21:25:13 | 2017-10-18 | -8.0 | 8.0 | 2017 | 0.0 | 1 | credit_card | 1 | 18.12 |
| 1 | e481f51cbdc54678b7cc49136f2d6af7 | 9ef432eb6251297304e76186b10a928d | delivered | 2017-10-02 10:56:33 | 2017-10-02 11:07:15 | 2017-10-04 19:55:00 | 2017-10-10 21:25:13 | 2017-10-18 | -8.0 | 8.0 | 2017 | 0.0 | 3 | voucher | 1 | 2.00 |
| 2 | e481f51cbdc54678b7cc49136f2d6af7 | 9ef432eb6251297304e76186b10a928d | delivered | 2017-10-02 10:56:33 | 2017-10-02 11:07:15 | 2017-10-04 19:55:00 | 2017-10-10 21:25:13 | 2017-10-18 | -8.0 | 8.0 | 2017 | 0.0 | 2 | voucher | 1 | 18.59 |
class ApprovalPaymentDashboard(param.Parameterized):
stats = param.ObjectSelector(default='approval_hours', objects=['approval_hours', 'approval_hours by payment type'])
sns.set(style="darkgrid")
plt.figure(figsize=(12,8), dpi= 100)
def sns_plot(self):
plt.clf()
if self.stats == 'approval_hours':
splot = sns.barplot(x="year", y="approval_hours",data=orders_approval_hours, ci=None)
else:
splot = sns.barplot(x="year", y="approval_hours", hue="payment_type", data=orders_approval_hours, ci=None)
for p in splot.patches:
splot.annotate(format(p.get_height(), '.1f'),
(p.get_x() + p.get_width() / 2., p.get_height()),
ha = 'center', va = 'center',
xytext = (0, 9),
textcoords = 'offset points')
return plt.gcf()
def append_end(self):
plt.clf()
return ""
ap1 = ApprovalPaymentDashboard(name='')
dashboard = pn.Column('Payment Approval Dashboard',
ap1.param,
ap1.sns_plot,
ap1.append_end
)
dashboard.embed()
<Figure size 1728x1152 with 0 Axes>
payment_count = orders_approval_hours.groupby(['payment_type', 'year']).size().reset_index(name='count')
payment_count
| payment_type | year | count | |
|---|---|---|---|
| 0 | boleto | 2017 | 3707 |
| 1 | boleto | 2018 | 3753 |
| 2 | credit_card | 2017 | 32779 |
| 3 | credit_card | 2018 | 38201 |
| 4 | debit_card | 2017 | 345 |
| 5 | debit_card | 2018 | 896 |
| 6 | voucher | 2017 | 2657 |
| 7 | voucher | 2018 | 2177 |
Majority of the payments were made through credit card. Credit card was also the fastest to be approved. Boleta takes around 5 hours to be approved, followed by more than 50 minutes in debit cards. In fact, more customers are using Boleto in 2018.
sns.set(style="darkgrid")
plt.figure(figsize=(12,8), dpi= 100)
splot = sns.barplot(x="year", y="count", hue="payment_type", data=payment_count, ci=None)
for p in splot.patches:
splot.annotate(format(p.get_height(), '.0f'),
(p.get_x() + p.get_width() / 2., p.get_height()),
ha = 'center', va = 'center',
xytext = (0, 9),
textcoords = 'offset points')
plt.show()
approve_deliver_proportion = delivered.copy(deep=False)
approve_deliver_proportion['deliver_days'] = (delivered['order_delivered_customer_date'] - delivered['order_approved_at']).dt.components['days']
approve_deliver_proportion = remove_outlier_iqr(approve_deliver_proportion, 'order_duration')
approve_deliver_proportion = remove_outlier_iqr(approve_deliver_proportion, 'deliver_days')
approve_deliver_proportion.head(3)
| order_id | customer_id | order_status | order_purchase_timestamp | order_approved_at | order_delivered_carrier_date | order_delivered_customer_date | order_estimated_delivery_date | order_delay | order_duration | year | approval_hours | deliver_days | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | e481f51cbdc54678b7cc49136f2d6af7 | 9ef432eb6251297304e76186b10a928d | delivered | 2017-10-02 10:56:33 | 2017-10-02 11:07:15 | 2017-10-04 19:55:00 | 2017-10-10 21:25:13 | 2017-10-18 | -8.0 | 8.0 | 2017 | 0.0 | 8.0 |
| 1 | 53cdb2fc8bc7dce0b6741e2150273451 | b0830fb4747a6c6d20dea0b8c802d7ef | delivered | 2018-07-24 20:41:37 | 2018-07-26 03:24:27 | 2018-07-26 14:31:00 | 2018-08-07 15:27:45 | 2018-08-13 | -6.0 | 13.0 | 2018 | 6.0 | 12.0 |
| 2 | 47770eb9100c2d0c44946d9cf07ec65d | 41ce2a54c0b03bf3443c3d931a367089 | delivered | 2018-08-08 08:38:49 | 2018-08-08 08:55:23 | 2018-08-08 13:50:00 | 2018-08-17 18:06:29 | 2018-09-04 | -18.0 | 9.0 | 2018 | 0.0 | 9.0 |
approve_deliver_proportion['order_duration'].mean()
10.253967191161701
Although approval hours for payment method Boleto can take as much as 5 hours, majority of customer's waiting time goes into the delivery process.
sns.set(style="darkgrid")
plt.figure(figsize=(12,8), dpi= 100)
sns.barplot(x = approve_deliver_proportion['year'], y = approve_deliver_proportion['order_duration'], color = "red")
bottom_plot = sns.barplot(x = approve_deliver_proportion['year'], y = approve_deliver_proportion['deliver_days'], color = "#0000A3")
topbar = plt.Rectangle((0,0),1,1,fc="red", edgecolor = 'none')
bottombar = plt.Rectangle((0,0),1,1,fc='#0000A3', edgecolor = 'none')
l = plt.legend([bottombar, topbar], ['Delivery', 'Approval'])
products = pd.read_csv("olist_products_dataset.csv")
products.head(3)
| product_id | product_category_name | product_name_lenght | product_description_lenght | product_photos_qty | product_weight_g | product_length_cm | product_height_cm | product_width_cm | |
|---|---|---|---|---|---|---|---|---|---|
| 0 | 1e9e8ef04dbcff4541ed26657ea517e5 | perfumaria | 40.0 | 287.0 | 1.0 | 225.0 | 16.0 | 10.0 | 14.0 |
| 1 | 3aa071139cb16b67ca9e5dea641aaa2f | artes | 44.0 | 276.0 | 1.0 | 1000.0 | 30.0 | 18.0 | 20.0 |
| 2 | 96bd76ec8810374ed1b65e291975717f | esporte_lazer | 46.0 | 250.0 | 1.0 | 154.0 | 18.0 | 9.0 | 15.0 |
msno.bar(products)
<AxesSubplot:>
product_volume = products.drop(['product_description_lenght', 'product_photos_qty', 'product_name_lenght'], axis = 1)
msno.bar(product_volume)
<AxesSubplot:>
Translating product category name
category_translation = pd.read_csv("product_category_name_translation.csv")
category_translation.head(3)
| product_category_name | product_category_name_english | |
|---|---|---|
| 0 | beleza_saude | health_beauty |
| 1 | informatica_acessorios | computers_accessories |
| 2 | automotivo | auto |
product_volume = pd.merge(product_volume, category_translation, how="left", on=["product_category_name"])
product_volume = product_volume.drop(['product_category_name'], axis = 1)
product_volume.head(3)
| product_id | product_weight_g | product_length_cm | product_height_cm | product_width_cm | product_category_name_english | |
|---|---|---|---|---|---|---|
| 0 | 1e9e8ef04dbcff4541ed26657ea517e5 | 225.0 | 16.0 | 10.0 | 14.0 | perfumery |
| 1 | 3aa071139cb16b67ca9e5dea641aaa2f | 1000.0 | 30.0 | 18.0 | 20.0 | art |
| 2 | 96bd76ec8810374ed1b65e291975717f | 154.0 | 18.0 | 9.0 | 15.0 | sports_leisure |
product_volume['product_category_name_english'].unique()
array(['perfumery', 'art', 'sports_leisure', 'baby', 'housewares',
'musical_instruments', 'cool_stuff', 'furniture_decor',
'home_appliances', 'toys', 'bed_bath_table',
'construction_tools_safety', 'computers_accessories',
'health_beauty', 'luggage_accessories', 'garden_tools',
'office_furniture', 'auto', 'electronics', 'fashion_shoes',
'telephony', 'stationery', 'fashion_bags_accessories', 'computers',
'home_construction', 'watches_gifts',
'construction_tools_construction', 'pet_shop', 'small_appliances',
'agro_industry_and_commerce', nan, 'furniture_living_room',
'signaling_and_security', 'air_conditioning', 'consoles_games',
'books_general_interest', 'costruction_tools_tools',
'fashion_underwear_beach', 'fashion_male_clothing',
'kitchen_dining_laundry_garden_furniture',
'industry_commerce_and_business', 'fixed_telephony',
'construction_tools_lights', 'books_technical',
'home_appliances_2', 'party_supplies', 'drinks', 'market_place',
'la_cuisine', 'costruction_tools_garden', 'fashio_female_clothing',
'home_confort', 'audio', 'food_drink', 'music', 'food',
'tablets_printing_image', 'books_imported',
'small_appliances_home_oven_and_coffee', 'fashion_sport',
'christmas_supplies', 'fashion_childrens_clothes', 'dvds_blu_ray',
'arts_and_craftmanship', 'furniture_bedroom', 'cine_photo',
'diapers_and_hygiene', 'flowers', 'home_comfort_2',
'security_and_services', 'furniture_mattress_and_upholstery',
'cds_dvds_musicals'], dtype=object)
There is no 'others' category, so for missing category names, they are considered 'others'
product_volume.fillna('others')
msno.bar(product_volume)
<AxesSubplot:>
product_volume['volume'] = product_volume['product_length_cm'] * product_volume['product_width_cm'] * product_volume['product_height_cm']
product_volume['weight'] = product_volume['product_weight_g'] / 1000
product_volume.head(3)
| product_id | product_weight_g | product_length_cm | product_height_cm | product_width_cm | product_category_name_english | volume | weight | |
|---|---|---|---|---|---|---|---|---|
| 0 | 1e9e8ef04dbcff4541ed26657ea517e5 | 225.0 | 16.0 | 10.0 | 14.0 | perfumery | 2240.0 | 0.225 |
| 1 | 3aa071139cb16b67ca9e5dea641aaa2f | 1000.0 | 30.0 | 18.0 | 20.0 | art | 10800.0 | 1.000 |
| 2 | 96bd76ec8810374ed1b65e291975717f | 154.0 | 18.0 | 9.0 | 15.0 | sports_leisure | 2430.0 | 0.154 |
product_volume.groupby('product_category_name_english').mean('volume').sort_values(by=['volume'], ascending=False)
| product_weight_g | product_length_cm | product_height_cm | product_width_cm | volume | weight | |
|---|---|---|---|---|---|---|
| product_category_name_english | ||||||
| furniture_mattress_and_upholstery | 13190.000000 | 46.300000 | 34.400000 | 41.300000 | 77244.300000 | 13.190000 |
| office_furniture | 12740.867314 | 55.627832 | 41.864078 | 37.919094 | 75468.469256 | 12.740867 |
| kitchen_dining_laundry_garden_furniture | 11598.563830 | 47.340426 | 40.478723 | 38.680851 | 69406.095745 | 11.598564 |
| home_appliances_2 | 9913.333333 | 45.733333 | 30.666667 | 38.166667 | 55476.311111 | 9.913333 |
| furniture_living_room | 8934.846154 | 50.730769 | 22.365385 | 44.429487 | 54486.128205 | 8.934846 |
| ... | ... | ... | ... | ... | ... | ... |
| watches_gifts | 509.287434 | 19.222724 | 10.292701 | 15.268623 | 3470.398044 | 0.509287 |
| books_technical | 1107.845528 | 27.325203 | 5.869919 | 18.463415 | 2758.991870 | 1.107846 |
| books_imported | 596.774194 | 29.741935 | 3.451613 | 21.225806 | 1935.387097 | 0.596774 |
| telephony | 236.506173 | 18.432981 | 6.853616 | 13.248677 | 1865.841270 | 0.236506 |
| dvds_blu_ray | 381.562500 | 21.270833 | 4.416667 | 14.875000 | 1746.854167 | 0.381562 |
71 rows × 6 columns
The heaviest and biggest product categories are furniture.
order_items.head(3)
| order_id | order_item_id | product_id | seller_id | shipping_limit_date | price | freight_value | |
|---|---|---|---|---|---|---|---|
| 0 | 00010242fe8c5a6d1ba2dd792cb16214 | 1 | 4244733e06e7ecb4970a6e2683c13e61 | 48436dade18ac8b2bce089ec2a041202 | 2017-09-19 09:45:35 | 58.9 | 13.29 |
| 1 | 00018f77f2f0320c557190d7a144bdd3 | 1 | e5f2d52b802189ee658865ca93d83a8f | dd7ddc04e1b6c2c614352b383efe2d36 | 2017-05-03 11:05:13 | 239.9 | 19.93 |
| 2 | 000229ec398224ef6ca0657da4fc703e | 1 | c777355d18b72b67abbeef9df44fd0fd | 5b51032eddd242adc84c38acab88f23d | 2018-01-18 14:48:30 | 199.0 | 17.87 |
order_weight_volume = pd.merge(delivered, order_items, how="left", on=["order_id"])
order_weight_volume = order_weight_volume.filter(['order_id', 'product_id'])
order_weight_volume.head(3)
| order_id | product_id | |
|---|---|---|
| 0 | e481f51cbdc54678b7cc49136f2d6af7 | 87285b34884572647811a353c7ac498a |
| 1 | 53cdb2fc8bc7dce0b6741e2150273451 | 595fac2a385ac33a80bd5114aec74eb8 |
| 2 | 47770eb9100c2d0c44946d9cf07ec65d | aa4383b373c6aca5d8797843e5594415 |
order_weight_volume = pd.merge(order_weight_volume, product_volume, how="left", on=["product_id"])
order_weight_volume = order_weight_volume.drop(columns=['product_weight_g', 'product_length_cm', 'product_width_cm', 'product_height_cm', 'product_id', 'product_category_name_english'])
order_weight_volume.head(3)
| order_id | volume | weight | |
|---|---|---|---|
| 0 | e481f51cbdc54678b7cc49136f2d6af7 | 1976.0 | 0.50 |
| 1 | 53cdb2fc8bc7dce0b6741e2150273451 | 4693.0 | 0.40 |
| 2 | 47770eb9100c2d0c44946d9cf07ec65d | 9576.0 | 0.42 |
order_weight_volume = order_weight_volume.groupby(['order_id']).sum(['volume', 'weight'])
order_weight_volume.head(3)
| volume | weight | |
|---|---|---|
| order_id | ||
| 00010242fe8c5a6d1ba2dd792cb16214 | 3528.0 | 0.65 |
| 00018f77f2f0320c557190d7a144bdd3 | 60000.0 | 30.00 |
| 000229ec398224ef6ca0657da4fc703e | 14157.0 | 3.05 |
order_weight_volume = pd.merge(order_weight_volume, delivered, how="left", on=["order_id"])
order_weight_volume.head(3)
| order_id | volume | weight | customer_id | order_status | order_purchase_timestamp | order_approved_at | order_delivered_carrier_date | order_delivered_customer_date | order_estimated_delivery_date | order_delay | order_duration | year | approval_hours | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 00010242fe8c5a6d1ba2dd792cb16214 | 3528.0 | 0.65 | 3ce436f183e68e07877b285a838db11a | delivered | 2017-09-13 08:59:02 | 2017-09-13 09:45:35 | 2017-09-19 18:34:16 | 2017-09-20 23:43:48 | 2017-09-29 | -9.0 | 7.0 | 2017 | 0.0 |
| 1 | 00018f77f2f0320c557190d7a144bdd3 | 60000.0 | 30.00 | f6dd3ec061db4e3987629fe6b26e5cce | delivered | 2017-04-26 10:53:06 | 2017-04-26 11:05:13 | 2017-05-04 14:35:00 | 2017-05-12 16:04:24 | 2017-05-15 | -3.0 | 16.0 | 2017 | 0.0 |
| 2 | 000229ec398224ef6ca0657da4fc703e | 14157.0 | 3.05 | 6489ae5e4333f3693df5ad4372dab6d3 | delivered | 2018-01-14 14:33:31 | 2018-01-14 14:48:30 | 2018-01-16 12:36:48 | 2018-01-22 13:19:16 | 2018-02-05 | -14.0 | 7.0 | 2018 | 0.0 |
class OrderWeightDashboard(param.Parameterized):
stats = param.ObjectSelector(default='volume', objects=['volume', 'weight'])
plt.figure(figsize=(12,8), dpi= 100)
def before_boxplot(self):
plt.clf()
sns.boxplot(x=order_weight_volume[self.stats])
return plt.gcf()
def after_boxplot(self):
plt.clf()
order_weight_volume_after = remove_outlier_iqr(order_weight_volume, self.stats)
sns.boxplot(x=order_weight_volume_after[self.stats])
return plt.gcf()
def append_end(self):
plt.clf()
return ""
ow1 = OrderWeightDashboard(name='')
dashboard = pn.Column('Order Weight Outlier Dashboard',
ow1.param,
pn.Tabs(
('Original', ow1.before_boxplot),
('Outlier removed', ow1.after_boxplot),
('',ow1.append_end)
)
)
dashboard.embed()
<Figure size 1728x1152 with 0 Axes>
order_weight_volume_corr = order_weight_volume.filter(['volume', 'weight', 'order_duration', 'approval_hours', 'order_delay', 'price', 'freight_value']).corr()
plt.figure(figsize=(12,8), dpi= 100)
sns.heatmap(order_weight_volume_corr, xticklabels=order_weight_volume_corr.columns, yticklabels=order_weight_volume_corr.columns, annot=True)
<AxesSubplot:>
Order weight and volume are not correlated with order delivery.
geo = pd.read_csv("olist_geolocation_dataset.csv")
geo.head(3)
| geolocation_zip_code_prefix | geolocation_lat | geolocation_lng | geolocation_city | geolocation_state | |
|---|---|---|---|---|---|
| 0 | 1037 | -23.545621 | -46.639292 | sao paulo | SP |
| 1 | 1046 | -23.546081 | -46.644820 | sao paulo | SP |
| 2 | 1046 | -23.546129 | -46.642951 | sao paulo | SP |
avarage_location = geo.groupby('geolocation_state').mean(['geolocation_lat', 'geolocation_lng'])
avarage_location['geolocation_state'] = avarage_location.index
avarage_location.head(3)
| geolocation_zip_code_prefix | geolocation_lat | geolocation_lng | geolocation_state | |
|---|---|---|---|---|
| geolocation_state | ||||
| AC | 69798.877786 | -9.702555 | -68.451852 | AC |
| AL | 57234.456371 | -9.599729 | -36.052017 | AL |
| AM | 69142.245888 | -3.349336 | -60.537430 | AM |
customers = pd.read_csv("olist_customers_dataset.csv")
customers.head(3)
| customer_id | customer_unique_id | customer_zip_code_prefix | customer_city | customer_state | |
|---|---|---|---|---|---|
| 0 | 06b8999e2fba1a1fbc88172c00ba8bc7 | 861eff4711a542e4b93843c6dd7febb0 | 14409 | franca | SP |
| 1 | 18955e83d337fd6b2def6b18a428ac77 | 290c77bc529b7ac935b93aa66c333dc3 | 9790 | sao bernardo do campo | SP |
| 2 | 4e7b3e00288586ebd08712fdd0374a03 | 060e732b5b29e8181a18229c7b0b2b5e | 1151 | sao paulo | SP |
msno.bar(customers)
<AxesSubplot:>
customer_state_geo = customers.merge(avarage_location.rename(columns={'geolocation_state':'customer_state'}),how='outer')
customer_state_geo.head(3)
| customer_id | customer_unique_id | customer_zip_code_prefix | customer_city | customer_state | geolocation_zip_code_prefix | geolocation_lat | geolocation_lng | |
|---|---|---|---|---|---|---|---|---|
| 0 | 06b8999e2fba1a1fbc88172c00ba8bc7 | 861eff4711a542e4b93843c6dd7febb0 | 14409 | franca | SP | 9054.286003 | -23.155308 | -47.084074 |
| 1 | 18955e83d337fd6b2def6b18a428ac77 | 290c77bc529b7ac935b93aa66c333dc3 | 9790 | sao bernardo do campo | SP | 9054.286003 | -23.155308 | -47.084074 |
| 2 | 4e7b3e00288586ebd08712fdd0374a03 | 060e732b5b29e8181a18229c7b0b2b5e | 1151 | sao paulo | SP | 9054.286003 | -23.155308 | -47.084074 |
import plotly.offline as pyo
import plotly.graph_objs as go
pyo.init_notebook_mode()
import plotly.express as px
fig = px.scatter_mapbox(customer_state_geo, lat="geolocation_lat", lon="geolocation_lng")
fig.update_layout(mapbox_style="open-street-map")
fig.show()
customer_state_density = avarage_location[avarage_location['geolocation_state'].isin(customer_state_geo['customer_state'])]
customer_state_density['count'] = customer_state_geo['customer_state'].value_counts()
customer_state_density
| geolocation_zip_code_prefix | geolocation_lat | geolocation_lng | geolocation_state | count | |
|---|---|---|---|---|---|
| geolocation_state | |||||
| AC | 69798.877786 | -9.702555 | -68.451852 | AC | 81 |
| AL | 57234.456371 | -9.599729 | -36.052017 | AL | 413 |
| AM | 69142.245888 | -3.349336 | -60.537430 | AM | 148 |
| AP | 68911.880422 | 0.086025 | -51.234304 | AP | 68 |
| BA | 44289.646858 | -13.049361 | -39.560649 | BA | 3380 |
| CE | 61662.827908 | -4.363151 | -39.004140 | CE | 1336 |
| DF | 71592.770137 | -15.810885 | -47.969630 | DF | 2140 |
| ES | 29327.859864 | -20.105145 | -40.503183 | ES | 2033 |
| GO | 75028.224440 | -16.577645 | -49.334195 | GO | 2020 |
| MA | 65411.114988 | -3.798997 | -44.818627 | MA | 747 |
| MG | 35354.955072 | -19.864647 | -44.421615 | MG | 11635 |
| MS | 79363.061068 | -20.765006 | -54.532140 | MS | 715 |
| MT | 78435.385670 | -14.156482 | -55.708956 | MT | 907 |
| PA | 67525.525569 | -2.631213 | -49.485862 | PA | 975 |
| PB | 58324.324305 | -7.088298 | -35.821678 | PB | 536 |
| PE | 53687.958678 | -8.179098 | -35.758866 | PE | 1652 |
| PI | 64324.879974 | -5.754989 | -42.509541 | PI | 495 |
| PR | 84328.662991 | -24.793607 | -50.879662 | PR | 5045 |
| RJ | 24002.060090 | -22.743477 | -43.155540 | RJ | 12852 |
| RN | 59280.773458 | -5.856702 | -35.990079 | RN | 485 |
| RO | 76885.564980 | -10.341289 | -62.720579 | RO | 253 |
| RR | 69315.043344 | 2.717100 | -60.672866 | RR | 46 |
| RS | 94996.301709 | -29.679191 | -52.032652 | RS | 5466 |
| SC | 88855.220492 | -27.222486 | -49.617937 | SC | 3637 |
| SE | 49197.524839 | -10.866199 | -37.181169 | SE | 350 |
| SP | 9054.286003 | -23.155308 | -47.084074 | SP | 41746 |
| TO | 77454.112975 | -9.503700 | -48.348661 | TO | 280 |
fig = px.density_mapbox(customer_state_density, lat='geolocation_lat', lon='geolocation_lng', z='count', radius=20,
center=dict(lat=0, lon=100), zoom=0,
mapbox_style="stamen-terrain")
fig.show()
Majority of the customers live in Sao Paulo.
sellers = pd.read_csv("olist_sellers_dataset.csv")
sellers.head(3)
| seller_id | seller_zip_code_prefix | seller_city | seller_state | |
|---|---|---|---|---|
| 0 | 3442f8959a84dea7ee197c632cb2df15 | 13023 | campinas | SP |
| 1 | d1b65fc7debc3361ea86b5f14c68d2e2 | 13844 | mogi guacu | SP |
| 2 | ce3ad9de960102d0677a81f5d0bb7b2d | 20031 | rio de janeiro | RJ |
msno.bar(sellers)
<AxesSubplot:>
seller_state_geo = sellers.merge(avarage_location.rename(columns={'geolocation_state':'seller_state'}),how='outer')
seller_state_geo.head(3)
| seller_id | seller_zip_code_prefix | seller_city | seller_state | geolocation_zip_code_prefix | geolocation_lat | geolocation_lng | |
|---|---|---|---|---|---|---|---|
| 0 | 3442f8959a84dea7ee197c632cb2df15 | 13023.0 | campinas | SP | 9054.286003 | -23.155308 | -47.084074 |
| 1 | d1b65fc7debc3361ea86b5f14c68d2e2 | 13844.0 | mogi guacu | SP | 9054.286003 | -23.155308 | -47.084074 |
| 2 | c0f3eea2e14555b6faeea3dd58c1b1c3 | 4195.0 | sao paulo | SP | 9054.286003 | -23.155308 | -47.084074 |
fig = px.scatter_mapbox(seller_state_geo, lat="geolocation_lat", lon="geolocation_lng")
fig.update_layout(mapbox_style="open-street-map")
fig.show()
avarage_location
| geolocation_zip_code_prefix | geolocation_lat | geolocation_lng | geolocation_state | |
|---|---|---|---|---|
| geolocation_state | ||||
| AC | 69798.877786 | -9.702555 | -68.451852 | AC |
| AL | 57234.456371 | -9.599729 | -36.052017 | AL |
| AM | 69142.245888 | -3.349336 | -60.537430 | AM |
| AP | 68911.880422 | 0.086025 | -51.234304 | AP |
| BA | 44289.646858 | -13.049361 | -39.560649 | BA |
| CE | 61662.827908 | -4.363151 | -39.004140 | CE |
| DF | 71592.770137 | -15.810885 | -47.969630 | DF |
| ES | 29327.859864 | -20.105145 | -40.503183 | ES |
| GO | 75028.224440 | -16.577645 | -49.334195 | GO |
| MA | 65411.114988 | -3.798997 | -44.818627 | MA |
| MG | 35354.955072 | -19.864647 | -44.421615 | MG |
| MS | 79363.061068 | -20.765006 | -54.532140 | MS |
| MT | 78435.385670 | -14.156482 | -55.708956 | MT |
| PA | 67525.525569 | -2.631213 | -49.485862 | PA |
| PB | 58324.324305 | -7.088298 | -35.821678 | PB |
| PE | 53687.958678 | -8.179098 | -35.758866 | PE |
| PI | 64324.879974 | -5.754989 | -42.509541 | PI |
| PR | 84328.662991 | -24.793607 | -50.879662 | PR |
| RJ | 24002.060090 | -22.743477 | -43.155540 | RJ |
| RN | 59280.773458 | -5.856702 | -35.990079 | RN |
| RO | 76885.564980 | -10.341289 | -62.720579 | RO |
| RR | 69315.043344 | 2.717100 | -60.672866 | RR |
| RS | 94996.301709 | -29.679191 | -52.032652 | RS |
| SC | 88855.220492 | -27.222486 | -49.617937 | SC |
| SE | 49197.524839 | -10.866199 | -37.181169 | SE |
| SP | 9054.286003 | -23.155308 | -47.084074 | SP |
| TO | 77454.112975 | -9.503700 | -48.348661 | TO |
seller_state_density = avarage_location[avarage_location['geolocation_state'].isin(seller_state_geo['seller_state'])]
seller_state_density['count'] = seller_state_geo['seller_state'].value_counts()
seller_state_density
| geolocation_zip_code_prefix | geolocation_lat | geolocation_lng | geolocation_state | count | |
|---|---|---|---|---|---|
| geolocation_state | |||||
| AC | 69798.877786 | -9.702555 | -68.451852 | AC | 1 |
| AL | 57234.456371 | -9.599729 | -36.052017 | AL | 1 |
| AM | 69142.245888 | -3.349336 | -60.537430 | AM | 1 |
| AP | 68911.880422 | 0.086025 | -51.234304 | AP | 1 |
| BA | 44289.646858 | -13.049361 | -39.560649 | BA | 19 |
| CE | 61662.827908 | -4.363151 | -39.004140 | CE | 13 |
| DF | 71592.770137 | -15.810885 | -47.969630 | DF | 30 |
| ES | 29327.859864 | -20.105145 | -40.503183 | ES | 23 |
| GO | 75028.224440 | -16.577645 | -49.334195 | GO | 40 |
| MA | 65411.114988 | -3.798997 | -44.818627 | MA | 1 |
| MG | 35354.955072 | -19.864647 | -44.421615 | MG | 244 |
| MS | 79363.061068 | -20.765006 | -54.532140 | MS | 5 |
| MT | 78435.385670 | -14.156482 | -55.708956 | MT | 4 |
| PA | 67525.525569 | -2.631213 | -49.485862 | PA | 1 |
| PB | 58324.324305 | -7.088298 | -35.821678 | PB | 6 |
| PE | 53687.958678 | -8.179098 | -35.758866 | PE | 9 |
| PI | 64324.879974 | -5.754989 | -42.509541 | PI | 1 |
| PR | 84328.662991 | -24.793607 | -50.879662 | PR | 349 |
| RJ | 24002.060090 | -22.743477 | -43.155540 | RJ | 171 |
| RN | 59280.773458 | -5.856702 | -35.990079 | RN | 5 |
| RO | 76885.564980 | -10.341289 | -62.720579 | RO | 2 |
| RR | 69315.043344 | 2.717100 | -60.672866 | RR | 1 |
| RS | 94996.301709 | -29.679191 | -52.032652 | RS | 129 |
| SC | 88855.220492 | -27.222486 | -49.617937 | SC | 190 |
| SE | 49197.524839 | -10.866199 | -37.181169 | SE | 2 |
| SP | 9054.286003 | -23.155308 | -47.084074 | SP | 1849 |
| TO | 77454.112975 | -9.503700 | -48.348661 | TO | 1 |
fig = px.density_mapbox(seller_state_density, lat='geolocation_lat', lon='geolocation_lng', z='count', radius=20,
center=dict(lat=0, lon=100), zoom=0,
mapbox_style="stamen-terrain")
fig.show()
Majority of the sellers also live in Sao Paulo
from math import sin, cos, sqrt, atan2, radians
def get_distance(lat1, lat2, lon1, lon2):
# approximate radius of earth in km
R = 6373.0
dlon = lon2 - lon1
dlat = lat2 - lat1
a = sin(dlat / 2)**2 + cos(lat1) * cos(lat2) * sin(dlon / 2)**2
c = 2 * atan2(sqrt(a), sqrt(1 - a))
return(R * c)
Customized function to get distance between 2 coordinates
order_distance = pd.merge(delivered, customer_state_geo.filter(['customer_id', 'geolocation_lat', 'geolocation_lng']), how="left", on=["customer_id"])
order_distance = order_distance.rename(columns={'geolocation_lat': 'customer_lat', 'geolocation_lng': 'customer_lng'})
order_distance.head(3)
| order_id | customer_id | order_status | order_purchase_timestamp | order_approved_at | order_delivered_carrier_date | order_delivered_customer_date | order_estimated_delivery_date | order_delay | order_duration | year | approval_hours | customer_lat | customer_lng | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | e481f51cbdc54678b7cc49136f2d6af7 | 9ef432eb6251297304e76186b10a928d | delivered | 2017-10-02 10:56:33 | 2017-10-02 11:07:15 | 2017-10-04 19:55:00 | 2017-10-10 21:25:13 | 2017-10-18 | -8.0 | 8.0 | 2017 | 0.0 | -23.155308 | -47.084074 |
| 1 | 53cdb2fc8bc7dce0b6741e2150273451 | b0830fb4747a6c6d20dea0b8c802d7ef | delivered | 2018-07-24 20:41:37 | 2018-07-26 03:24:27 | 2018-07-26 14:31:00 | 2018-08-07 15:27:45 | 2018-08-13 | -6.0 | 13.0 | 2018 | 6.0 | -13.049361 | -39.560649 |
| 2 | 47770eb9100c2d0c44946d9cf07ec65d | 41ce2a54c0b03bf3443c3d931a367089 | delivered | 2018-08-08 08:38:49 | 2018-08-08 08:55:23 | 2018-08-08 13:50:00 | 2018-08-17 18:06:29 | 2018-09-04 | -18.0 | 9.0 | 2018 | 0.0 | -16.577645 | -49.334195 |
order_distance = pd.merge(order_distance, order_items.filter(['order_id', 'seller_id']), how="left", on=["order_id"])
order_distance = pd.merge(order_distance, seller_state_geo.filter(['seller_id', 'geolocation_lat', 'geolocation_lng']), how="left", on=["seller_id"])
order_distance = order_distance.rename(columns={'geolocation_lat': 'seller_lat', 'geolocation_lng': 'seller_lng'})
order_distance.head(3)
| order_id | customer_id | order_status | order_purchase_timestamp | order_approved_at | order_delivered_carrier_date | order_delivered_customer_date | order_estimated_delivery_date | order_delay | order_duration | year | approval_hours | customer_lat | customer_lng | seller_id | seller_lat | seller_lng | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | e481f51cbdc54678b7cc49136f2d6af7 | 9ef432eb6251297304e76186b10a928d | delivered | 2017-10-02 10:56:33 | 2017-10-02 11:07:15 | 2017-10-04 19:55:00 | 2017-10-10 21:25:13 | 2017-10-18 | -8.0 | 8.0 | 2017 | 0.0 | -23.155308 | -47.084074 | 3504c0cb71d7fa48d967e0e4c94d59d9 | -23.155308 | -47.084074 |
| 1 | 53cdb2fc8bc7dce0b6741e2150273451 | b0830fb4747a6c6d20dea0b8c802d7ef | delivered | 2018-07-24 20:41:37 | 2018-07-26 03:24:27 | 2018-07-26 14:31:00 | 2018-08-07 15:27:45 | 2018-08-13 | -6.0 | 13.0 | 2018 | 6.0 | -13.049361 | -39.560649 | 289cdb325fb7e7f891c38608bf9e0962 | -23.155308 | -47.084074 |
| 2 | 47770eb9100c2d0c44946d9cf07ec65d | 41ce2a54c0b03bf3443c3d931a367089 | delivered | 2018-08-08 08:38:49 | 2018-08-08 08:55:23 | 2018-08-08 13:50:00 | 2018-08-17 18:06:29 | 2018-09-04 | -18.0 | 9.0 | 2018 | 0.0 | -16.577645 | -49.334195 | 4869f7a5dfa277a7dca6462dcf3b52b2 | -23.155308 | -47.084074 |
order_distance['order_distance'] = order_distance.apply(lambda x: get_distance(x['customer_lat'], x['customer_lng'], x['seller_lat'], x['seller_lng']), axis=1)
order_distance.head(3)
| order_id | customer_id | order_status | order_purchase_timestamp | order_approved_at | order_delivered_carrier_date | order_delivered_customer_date | order_estimated_delivery_date | order_delay | order_duration | year | approval_hours | customer_lat | customer_lng | seller_id | seller_lat | seller_lng | order_distance | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | e481f51cbdc54678b7cc49136f2d6af7 | 9ef432eb6251297304e76186b10a928d | delivered | 2017-10-02 10:56:33 | 2017-10-02 11:07:15 | 2017-10-04 19:55:00 | 2017-10-10 21:25:13 | 2017-10-18 | -8.0 | 8.0 | 2017 | 0.0 | -23.155308 | -47.084074 | 3504c0cb71d7fa48d967e0e4c94d59d9 | -23.155308 | -47.084074 | 9339.120177 |
| 1 | 53cdb2fc8bc7dce0b6741e2150273451 | b0830fb4747a6c6d20dea0b8c802d7ef | delivered | 2018-07-24 20:41:37 | 2018-07-26 03:24:27 | 2018-07-26 14:31:00 | 2018-08-07 15:27:45 | 2018-08-13 | -6.0 | 13.0 | 2018 | 6.0 | -13.049361 | -39.560649 | 289cdb325fb7e7f891c38608bf9e0962 | -23.155308 | -47.084074 | 7705.414246 |
| 2 | 47770eb9100c2d0c44946d9cf07ec65d | 41ce2a54c0b03bf3443c3d931a367089 | delivered | 2018-08-08 08:38:49 | 2018-08-08 08:55:23 | 2018-08-08 13:50:00 | 2018-08-17 18:06:29 | 2018-09-04 | -18.0 | 9.0 | 2018 | 0.0 | -16.577645 | -49.334195 | 4869f7a5dfa277a7dca6462dcf3b52b2 | -23.155308 | -47.084074 | 6855.903768 |
class OrderDistanceDashboard(param.Parameterized):
stats = param.ObjectSelector(default='Original', objects=['Original', 'Outlier removed'])
plt.figure(figsize=(12,8), dpi= 100)
def after_boxplot(self):
plt.clf()
if self.stats == 'Original':
sns.boxplot(x=order_distance['order_distance'])
else:
order_no_outlier = remove_outlier_iqr(order_distance, 'order_distance')
sns.boxplot(x=order_no_outlier['order_distance'])
return plt.gcf()
def append_end(self):
plt.clf()
return ""
od1 = OrderPriceDashboard(name='')
dashboard = pn.Column('Order Distance Outlier Dashboard',
od1.param,
od1.after_boxplot,
od1.append_end
)
dashboard.embed()
<Figure size 1728x1152 with 0 Axes>
order_distance_corr = order_distance.filter(['order_distance', 'order_duration', 'approval_hours', 'order_delay']).corr()
plt.figure(figsize=(12,8), dpi= 100)
sns.heatmap(order_distance_corr, xticklabels=order_distance_corr.columns, yticklabels=order_distance_corr.columns, annot=True)
<AxesSubplot:>
Distance is not a reliable predictor for delivery time taken