import pandas as pd
import numpy as np
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
pd.set_option('display.max_columns', 500)
df=pd.read_excel('data/SuperstoreSales.xlsx')
df.head(2)
Order ID | Order Date | Order Priority | Order Quantity | Sales | Discount | Ship Mode | Profit | Unit Price | Shipping Cost | Customer Name | Customer State | Zip Code | Region | Customer Segment | Product Category | Product Sub-Category | Product Name | Product Container | Product Base Margin | Ship Date | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 3 | 2010-08-26 | Low | 6 | 261.54 | 0.04 | Regular Air | -213.25 | 38.94 | 35.00 | Muhammed MacIntyre | New Jersey | 7514 | East | Small Business | Office Supplies | Storage & Organization | Eldon Base for stackable storage shelf, platinum | Large Box | 0.80 | 2010-09-02 |
1 | 6 | 2012-01-03 | Not Specified | 2 | 6.93 | 0.01 | Regular Air | -4.64 | 2.08 | 2.56 | Ruben Dartt | New York | 10457 | East | Corporate | Office Supplies | Scissors, Rulers and Trimmers | Kleencut® Forged Office Shears by Acme United ... | Small Pack | 0.55 | 2012-01-04 |
# check column names, number of observations, data type
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 8399 entries, 0 to 8398 Data columns (total 21 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Order ID 8399 non-null int64 1 Order Date 8399 non-null datetime64[ns] 2 Order Priority 8399 non-null object 3 Order Quantity 8399 non-null int64 4 Sales 8399 non-null float64 5 Discount 8399 non-null float64 6 Ship Mode 8399 non-null object 7 Profit 8399 non-null float64 8 Unit Price 8399 non-null float64 9 Shipping Cost 8399 non-null float64 10 Customer Name 8399 non-null object 11 Customer State 8399 non-null object 12 Zip Code 8399 non-null int64 13 Region 8399 non-null object 14 Customer Segment 8399 non-null object 15 Product Category 8399 non-null object 16 Product Sub-Category 8399 non-null object 17 Product Name 8399 non-null object 18 Product Container 8399 non-null object 19 Product Base Margin 8399 non-null float64 20 Ship Date 8399 non-null datetime64[ns] dtypes: datetime64[ns](2), float64(6), int64(3), object(10) memory usage: 1.3+ MB
df.columns=df.columns.str.replace(' ', '')
df.columns=df.columns.str.lower()
df.head(2)
orderid | orderdate | orderpriority | orderquantity | sales | discount | shipmode | profit | unitprice | shippingcost | customername | customerstate | zipcode | region | customersegment | productcategory | productsub-category | productname | productcontainer | productbasemargin | shipdate | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 3 | 2010-08-26 | Low | 6 | 261.54 | 0.04 | Regular Air | -213.25 | 38.94 | 35.00 | Muhammed MacIntyre | New Jersey | 7514 | East | Small Business | Office Supplies | Storage & Organization | Eldon Base for stackable storage shelf, platinum | Large Box | 0.80 | 2010-09-02 |
1 | 6 | 2012-01-03 | Not Specified | 2 | 6.93 | 0.01 | Regular Air | -4.64 | 2.08 | 2.56 | Ruben Dartt | New York | 10457 | East | Corporate | Office Supplies | Scissors, Rulers and Trimmers | Kleencut® Forged Office Shears by Acme United ... | Small Pack | 0.55 | 2012-01-04 |
df.columns
Index(['orderid', 'orderdate', 'orderpriority', 'orderquantity', 'sales', 'discount', 'shipmode', 'profit', 'unitprice', 'shippingcost', 'customername', 'customerstate', 'zipcode', 'region', 'customersegment', 'productcategory', 'productsub-category', 'productname', 'productcontainer', 'productbasemargin', 'shipdate'], dtype='object')
# check missing values
df.isnull().sum()
orderid 0 orderdate 0 orderpriority 0 orderquantity 0 sales 0 discount 0 shipmode 0 profit 0 unitprice 0 shippingcost 0 customername 0 customerstate 0 zipcode 0 region 0 customersegment 0 productcategory 0 productsub-category 0 productname 0 productcontainer 0 productbasemargin 0 shipdate 0 dtype: int64
# summary statistics
df.describe()
orderid | orderquantity | sales | discount | profit | unitprice | shippingcost | zipcode | productbasemargin | |
---|---|---|---|---|---|---|---|---|---|
count | 8399.000000 | 8399.000000 | 8399.000000 | 8399.000000 | 8399.000000 | 8399.000000 | 8399.000000 | 8399.000000 | 8399.000000 |
mean | 29965.179783 | 25.575426 | 1774.860490 | 0.049664 | 181.401194 | 89.310426 | 12.856427 | 48256.097631 | 0.512886 |
std | 17260.883447 | 14.482837 | 3585.107345 | 0.031826 | 1224.670845 | 290.361311 | 17.318257 | 30689.098271 | 0.135991 |
min | 3.000000 | 1.000000 | 2.240000 | 0.000000 | -16253.680000 | 0.990000 | 0.490000 | 501.000000 | 0.350000 |
25% | 15011.500000 | 13.000000 | 143.035000 | 0.020000 | -82.764250 | 6.480000 | 3.300000 | 21794.000000 | 0.380000 |
50% | 29857.000000 | 26.000000 | 449.420000 | 0.050000 | -1.490000 | 20.990000 | 6.070000 | 43317.000000 | 0.520000 |
75% | 44596.000000 | 38.000000 | 1706.285000 | 0.080000 | 166.712250 | 85.990000 | 13.990000 | 81221.000000 | 0.590000 |
max | 59973.000000 | 50.000000 | 89061.050000 | 0.250000 | 27220.690000 | 6783.020000 | 164.730000 | 99403.000000 | 0.850000 |
# index is like primary key in database (unique number assigned to each observation)
# check index column
df.index
RangeIndex(start=0, stop=8399, step=1)
# then set datetime [date] column as index (primary key). This is for time series analysis below.
# orderdate
df=df.set_index('orderdate')
df.head(2)
orderid | orderpriority | orderquantity | sales | discount | shipmode | profit | unitprice | shippingcost | customername | customerstate | zipcode | region | customersegment | productcategory | productsub-category | productname | productcontainer | productbasemargin | shipdate | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
orderdate | ||||||||||||||||||||
2010-08-26 | 3 | Low | 6 | 261.54 | 0.04 | Regular Air | -213.25 | 38.94 | 35.00 | Muhammed MacIntyre | New Jersey | 7514 | East | Small Business | Office Supplies | Storage & Organization | Eldon Base for stackable storage shelf, platinum | Large Box | 0.80 | 2010-09-02 |
2012-01-03 | 6 | Not Specified | 2 | 6.93 | 0.01 | Regular Air | -4.64 | 2.08 | 2.56 | Ruben Dartt | New York | 10457 | East | Corporate | Office Supplies | Scissors, Rulers and Trimmers | Kleencut® Forged Office Shears by Acme United ... | Small Pack | 0.55 | 2012-01-04 |
df['sales'].sum()
14907053.254
df['profit'].sum()
1523588.6295
month=df.resample('M').size()
month.head()
orderdate 2008-11-30 141 2008-12-31 182 2009-01-31 153 2009-02-28 186 2009-03-31 177 Freq: M, dtype: int64
monthsum=df.resample('M').sum()
monthsum.head()
orderid | orderquantity | sales | discount | profit | unitprice | shippingcost | zipcode | productbasemargin | |
---|---|---|---|---|---|---|---|---|---|
orderdate | |||||||||
2008-11-30 | 3861057 | 3352 | 303089.5465 | 6.76 | 23256.1785 | 19202.58 | 1946.95 | 7224279 | 73.16 |
2008-12-31 | 5915189 | 4814 | 427854.4250 | 9.11 | 45477.2050 | 20256.07 | 2848.97 | 9539552 | 92.05 |
2009-01-31 | 4709069 | 3581 | 295281.2650 | 7.53 | -6656.1085 | 23614.82 | 2531.44 | 6451957 | 81.04 |
2009-02-28 | 5500425 | 4705 | 448367.6080 | 9.37 | 63989.8160 | 28162.80 | 2296.74 | 8956195 | 94.45 |
2009-03-31 | 5152652 | 4738 | 297821.1905 | 9.20 | 25337.5000 | 11748.30 | 2212.40 | 9839537 | 90.88 |
plt.figure(figsize=[10,6])
plt.plot(monthsum['sales'])
[<matplotlib.lines.Line2D at 0x2a6509c6f10>]
plt.figure(figsize=[10,6])
df.resample('M').size().plot(kind='bar')
<AxesSubplot:xlabel='orderdate'>
#which month with the highest sales? October is that month (make this a mark down)
df.groupby(df.index.strftime('%B'))['sales'].size().sort_values(ascending=False)
orderdate October 753 July 738 August 731 April 723 March 715 June 709 December 703 November 699 January 676 February 658 September 650 May 644 Name: sales, dtype: int64
#Which month has the highest profit?
df.groupby(df.index.strftime('%B'))['profit'].sum().sort_values(ascending=False)
orderdate August 167858.6745 November 155983.4200 September 147600.0670 October 137874.7760 February 125970.5260 May 124034.5890 April 121924.6240 March 117642.5065 December 113894.4390 July 108811.6365 January 104597.5425 June 97395.8285 Name: profit, dtype: float64
df.groupby(df.index.strftime('%B'))['shippingcost'].sum().sort_values(ascending=False)
orderdate October 10321.00 November 10005.58 July 9669.23 December 9606.14 June 9344.89 January 8848.27 March 8776.34 September 8528.43 August 8517.75 April 8400.05 February 8288.41 May 7675.04 Name: shippingcost, dtype: float64
df.groupby(df.index.strftime('%B'))['shippingcost'].sum().sort_values(ascending=False).plot(kind='bar')
<AxesSubplot:xlabel='orderdate'>
df.groupby('productcategory')['sales'].sum().sort_values(ascending=False)
productcategory Technology 5987571.402 Furniture 5119254.482 Office Supplies 3800227.370 Name: sales, dtype: float64
df.groupby('productcategory')['sales'].sum().sort_values(ascending=False).plot(kind='bar')
<AxesSubplot:xlabel='productcategory'>
df.groupby('productcategory')['profit'].sum().sort_values(ascending=False).head(2)
productcategory Technology 855437.2250 Office Supplies 515776.6045 Name: profit, dtype: float64
df.groupby('productcategory')['profit'].sum().sort_values(ascending=False).head().plot(kind='barh')
<AxesSubplot:ylabel='productcategory'>
df.groupby('region')['profit'].sum().sort_values(ascending=False).head()
region South 505678.7515 West 422868.1505 East 297802.5185 Central 297239.2090 Name: profit, dtype: float64
df.groupby('region')['profit'].sum().sort_values(ascending=False).head().plot(kind='bar')
<AxesSubplot:xlabel='region'>
df.pivot_table(index="region", columns="productcategory", values="profit", \
aggfunc='sum', fill_value=0, margins=True)
productcategory | Furniture | Office Supplies | Technology | All |
---|---|---|---|---|
region | ||||
Central | 44509.69 | 121977.2900 | 130752.229 | 2.972392e+05 |
East | -12721.46 | 107470.5935 | 203053.385 | 2.978025e+05 |
South | 62901.90 | 140659.2615 | 302117.590 | 5.056788e+05 |
West | 57684.67 | 145669.4595 | 219514.021 | 4.228682e+05 |
All | 152374.80 | 515776.6045 | 855437.225 | 1.523589e+06 |
df.pivot_table(index="region", columns="productcategory", values="profit", \
aggfunc='sum', fill_value=0, margins=True).plot(kind='bar')
<AxesSubplot:xlabel='region'>
eaststates = df.loc[df['region'] == 'East']
eaststates.groupby('productcategory')['profit'].sum()
eaststates.head(2)
orderid | orderpriority | orderquantity | sales | discount | shipmode | profit | unitprice | shippingcost | customername | customerstate | zipcode | region | customersegment | productcategory | productsub-category | productname | productcontainer | productbasemargin | shipdate | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
orderdate | ||||||||||||||||||||
2010-08-26 | 3 | Low | 6 | 261.54 | 0.04 | Regular Air | -213.25 | 38.94 | 35.00 | Muhammed MacIntyre | New Jersey | 7514 | East | Small Business | Office Supplies | Storage & Organization | Eldon Base for stackable storage shelf, platinum | Large Box | 0.80 | 2010-09-02 |
2012-01-03 | 6 | Not Specified | 2 | 6.93 | 0.01 | Regular Air | -4.64 | 2.08 | 2.56 | Ruben Dartt | New York | 10457 | East | Corporate | Office Supplies | Scissors, Rulers and Trimmers | Kleencut® Forged Office Shears by Acme United ... | Small Pack | 0.55 | 2012-01-04 |
eaststates.pivot_table(index="customerstate", columns="productcategory", values="profit", \
aggfunc='sum', fill_value=0, margins=True)
productcategory | Furniture | Office Supplies | Technology | All |
---|---|---|---|---|
customerstate | ||||
Connecticut | -12852.13 | 20425.2045 | 36178.566 | 43751.6405 |
Delaware | 938.99 | 5662.7815 | 29740.743 | 36342.5145 |
Mass | -5651.18 | 41313.6870 | 33954.677 | 69617.1840 |
New Jersey | -18171.95 | 14495.9140 | 39667.960 | 35991.9240 |
New York | 23014.81 | 25573.0065 | 63511.439 | 112099.2555 |
All | -12721.46 | 107470.5935 | 203053.385 | 297802.5185 |
eaststates.pivot_table(index="customerstate", columns="productcategory", values="profit", \
aggfunc='sum', fill_value=0, margins=True).plot(kind='bar')
<AxesSubplot:xlabel='customerstate'>
nebraska = df.loc[df['customerstate'] == 'Nebraska']
nebyear=nebraska.resample('Y')['profit'].sum().sort_values(ascending=False)
nebyear.head()
orderdate 2009-12-31 18355.1030 2012-12-31 15597.6875 2011-12-31 12363.0425 2010-12-31 12314.2330 2008-12-31 -363.9200 Name: profit, dtype: float64
nebyear=nebraska.resample('Y')['profit'].sum().sort_values(ascending=False).plot(kind='bar')
nebraska.groupby('productsub-category')['profit'].sum().sort_values(ascending=False)
productsub-category Chairs & Chairmats 20138.890 Copiers and Fax 17546.489 Binders and Binder Accessories 8787.901 Telephones and Communication 7394.336 Appliances 5684.610 Office Furnishings 4007.010 Paper 3140.810 Envelopes 753.590 Pens & Art Supplies 707.730 Labels 422.790 Computer Peripherals 57.860 Rubber Bands 16.850 Scissors, Rulers and Trimmers -266.860 Storage & Organization -631.050 Bookcases -1086.110 Office Machines -3146.690 Tables -5262.010 Name: profit, dtype: float64
nebraska.groupby('productsub-category')['profit'].sum().sort_values(ascending=False).plot(kind='bar')
<AxesSubplot:xlabel='productsub-category'>
nebraska = df.loc[df['customerstate'] == 'Nebraska']
nebraska.groupby('zipcode')['profit'].sum().sort_values(ascending=False)
zipcode 68770 19498.4820 68717 12265.1240 69367 10809.8590 68001 7771.4990 68155 7049.8825 69141 1302.5270 69358 185.5075 68016 -616.7350 Name: profit, dtype: float64
nebraska.groupby('zipcode')['profit'].sum().sort_values(ascending=False).plot(kind='bar')
<AxesSubplot:xlabel='zipcode'>
furn = df.loc[df['productcategory'] == 'Furniture']
furn.groupby('productsub-category')['profit'].sum().sort_values(ascending=False)
productsub-category Chairs & Chairmats 149938.23 Office Furnishings 96878.40 Bookcases -33170.68 Tables -61271.15 Name: profit, dtype: float64
furn.groupby('productsub-category')['profit'].sum().sort_values(ascending=False).plot(kind='bar')
<AxesSubplot:xlabel='productsub-category'>
furnoff = df.loc[df['productcategory'] == 'Office Supplies']
furnoff.groupby('productsub-category')['profit'].sum().sort_values(ascending=False)
productsub-category Binders and Binder Accessories 307174.1845 Appliances 98023.0800 Envelopes 48711.0800 Paper 45987.2000 Labels 13689.1500 Pens & Art Supplies 7551.8100 Storage & Organization 2516.9400 Rubber Bands -77.5900 Scissors, Rulers and Trimmers -7799.2500 Name: profit, dtype: float64
furnoff.groupby('productsub-category')['profit'].sum().sort_values(ascending=False).plot(kind='bar')
<AxesSubplot:xlabel='productsub-category'>
furnoff.groupby('productsub-category')['sales'].sum().sort_values(ascending=False)
productsub-category Storage & Organization 1099774.36 Binders and Binder Accessories 1024521.85 Appliances 747333.01 Paper 449496.47 Envelopes 176298.06 Pens & Art Supplies 167525.03 Scissors, Rulers and Trimmers 80996.31 Labels 39042.35 Rubber Bands 15239.93 Name: sales, dtype: float64
tech = df.loc[df['productcategory'] == 'Technology']
tech.groupby('productsub-category')['profit'].sum().sort_values(ascending=False)
productsub-category Telephones and Communication 316951.641 Office Machines 276590.340 Copiers and Fax 167361.464 Computer Peripherals 94533.780 Name: profit, dtype: float64
tech.groupby('productsub-category')['profit'].sum().sort_values(ascending=False).plot(kind='bar')
<AxesSubplot:xlabel='productsub-category'>
df.resample('Y')['profit'].sum().plot()
df.resample('Y')['sales'].sum().plot(color='k')
<AxesSubplot:xlabel='orderdate'>
year=df.resample('Y')['profit'].sum().sort_values(ascending=False)
year
orderdate 2011-12-31 419755.3845 2009-12-31 406718.5260 2010-12-31 337239.4165 2012-12-31 291141.9190 2008-12-31 68733.3835 Name: profit, dtype: float64
year=df.resample('Y')['profit'].sum().sort_values(ascending=False)[:-1]
year
orderdate 2011-12-31 419755.3845 2009-12-31 406718.5260 2010-12-31 337239.4165 2012-12-31 291141.9190 Name: profit, dtype: float64
year=df.resample('Y')['profit'].sum().sort_values(ascending=False)[:-1].plot(kind='bar')
d2009=df.loc['2009']
d2009.head()
orderid | orderpriority | orderquantity | sales | discount | shipmode | profit | unitprice | shippingcost | customername | customerstate | zipcode | region | customersegment | productcategory | productsub-category | productname | productcontainer | productbasemargin | shipdate | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
orderdate | ||||||||||||||||||||
2009-04-16 | 69 | Not Specified | 42 | 1186.0600 | 0.09 | Regular Air | 511.690 | 30.93 | 3.92 | Jonathan Doherty | Ohio | 43567 | Central | Corporate | Furniture | Office Furnishings | Advantus Employee of the Month Certificate Fra... | Small Pack | 0.44 | 2009-04-18 |
2009-04-16 | 69 | Not Specified | 28 | 51.5300 | 0.03 | Express Air | 0.350 | 1.68 | 0.70 | Jonathan Doherty | Ohio | 43567 | Central | Corporate | Office Supplies | Pens & Art Supplies | Newell 308 | Wrap Bag | 0.60 | 2009-04-18 |
2009-02-27 | 96 | High | 37 | 4158.1235 | 0.01 | Regular Air | 1228.887 | 125.99 | 8.99 | Keith Dawkins | Texas | 73301 | Central | Home Office | Technology | Telephones and Communication | SC7868i | Small Box | 0.55 | 2009-03-01 |
2009-12-11 | 97 | Medium | 26 | 75.5700 | 0.03 | Regular Air | 28.240 | 2.89 | 0.50 | Craig Yedwab | Arizona | 85354 | West | Consumer | Office Supplies | Labels | Avery 498 | Small Box | 0.38 | 2009-12-12 |
2009-04-30 | 224 | Not Specified | 25 | 184.8600 | 0.09 | Regular Air | -33.950 | 7.38 | 5.21 | Bart Folk | Connecticut | 6439 | East | Corporate | Furniture | Office Furnishings | Eldon® Expressions™ Wood Desk Accessories, Oak | Small Box | 0.56 | 2009-05-01 |
d2009.groupby('region')['profit'].sum().sort_values(ascending=False)
region South 160088.0775 West 121861.4615 Central 78111.7495 East 46657.2375 Name: profit, dtype: float64
d2009.groupby('zipcode')['profit'].sum().sort_values(ascending=True).head()
zipcode 10501 -16557.0850 35077 -11928.9000 93456 -11522.6075 8989 -7748.9515 49877 -7703.7515 Name: profit, dtype: float64
d2009.groupby('zipcode')['profit'].sum().sort_values(ascending=False).plot(kind='bar')
<AxesSubplot:xlabel='zipcode'>
d2009.groupby('productcategory')['profit'].sum().sort_values(ascending=True)
productcategory Furniture 71904.900 Office Supplies 167174.061 Technology 167639.565 Name: profit, dtype: float64
d2009.groupby('productcategory')['profit'].sum().sort_values(ascending=True).plot(kind='bar')
<AxesSubplot:xlabel='productcategory'>
d2009.groupby('productsub-category')['profit'].sum().sort_values(ascending=False)
productsub-category Binders and Binder Accessories 109436.821 Telephones and Communication 81614.967 Chairs & Chairmats 54335.650 Copiers and Fax 49296.048 Office Furnishings 25795.640 Computer Peripherals 18595.350 Appliances 18495.330 Office Machines 18133.200 Paper 16214.710 Storage & Organization 14197.810 Envelopes 7046.170 Labels 2936.880 Bookcases 2394.950 Pens & Art Supplies 1746.230 Rubber Bands 27.820 Scissors, Rulers and Trimmers -2927.710 Tables -10621.340 Name: profit, dtype: float64
d2009.groupby('productsub-category')['profit'].sum().sort_values(ascending=False).plot(kind='bar')
<AxesSubplot:xlabel='productsub-category'>
d2009.groupby('productsub-category')['sales'].sum().sort_values(ascending=False)
productsub-category Office Machines 595848.910 Chairs & Chairmats 492855.750 Telephones and Communication 467231.791 Tables 443875.096 Binders and Binder Accessories 327280.380 Storage & Organization 318305.210 Copiers and Fax 317123.330 Bookcases 226657.260 Computer Peripherals 203511.070 Office Furnishings 177617.260 Appliances 157640.070 Paper 120688.030 Pens & Art Supplies 39460.140 Envelopes 31183.570 Scissors, Rulers and Trimmers 17067.110 Labels 8970.850 Rubber Bands 2554.310 Name: sales, dtype: float64
d2009.groupby('productsub-category')['sales'].sum().sort_values(ascending=False).plot(kind='bar')
<AxesSubplot:xlabel='productsub-category'>
d2009.groupby('productname')['profit'].sum().sort_values(ascending=True).head(1)
productname Okidata Pacemark 4410N Wide Format Dot Matrix Printer -26610.81 Name: profit, dtype: float64
d2009.groupby('productname')['sales'].sum().sort_values(ascending=True).head(1)
productname *Staples* Packaging Labels 6.13 Name: sales, dtype: float64
d2009jan = df.loc['1/1/2009':'1/31/2009']
d2009jan.groupby('region')['profit'].sum().sort_values(ascending=False)
region West 3374.3950 Central 2516.9330 South -463.1045 East -12084.3320 Name: profit, dtype: float64
d2009jan.groupby('region')['profit'].sum().sort_values(ascending=False).plot(kind='bar')
<AxesSubplot:xlabel='region'>
d2009jan.groupby('region')['sales'].sum().sort_values(ascending=False)
region South 110149.4795 East 92038.6130 West 56051.6735 Central 37041.4990 Name: sales, dtype: float64
d2009jan.groupby('region')['sales'].sum().sort_values(ascending=False).plot(kind='bar')
<AxesSubplot:xlabel='region'>
d2009jan.groupby('zipcode')['profit'].sum().sort_values(ascending=True).head()
zipcode 10501 -16253.680 35077 -11769.170 21230 -2073.898 5544 -2041.280 23111 -1271.710 Name: profit, dtype: float64
d2009jan.groupby('zipcode')['sales'].sum().sort_values(ascending=True).head()
zipcode 21290 29.23 21022 31.36 80746 82.21 14925 93.73 83810 118.51 Name: sales, dtype: float64
d2009jan.groupby('productcategory')['profit'].sum().sort_values(ascending=True).head()
productcategory Technology -10547.7010 Office Supplies -1516.0075 Furniture 5407.6000 Name: profit, dtype: float64
d2009jan.groupby('productcategory')['sales'].sum().sort_values(ascending=True).head()
productcategory Office Supplies 56731.790 Furniture 119225.416 Technology 119324.059 Name: sales, dtype: float64
d2009jan.groupby('productsub-category')['profit'].sum().sort_values(ascending=True).head(7)
productsub-category Copiers and Fax -8496.27 Office Machines -2435.86 Storage & Organization -2093.51 Tables -2010.32 Scissors, Rulers and Trimmers -1767.60 Bookcases -1196.03 Pens & Art Supplies -3.21 Name: profit, dtype: float64
d2009jan.groupby('productsub-category')['sales'].sum().sort_values(ascending=True).head(7)
productsub-category Labels 444.57 Envelopes 1247.72 Pens & Art Supplies 2446.02 Binders and Binder Accessories 5200.74 Paper 5561.16 Scissors, Rulers and Trimmers 9631.78 Office Furnishings 13760.32 Name: sales, dtype: float64
d2009jan.groupby('productname')['profit'].sum().sort_values(ascending=True).head(1)
productname Polycom ViewStation™ ISDN Videoconferencing Unit -16253.68 Name: profit, dtype: float64
d2009jan.groupby('productname')['sales'].sum().sort_values(ascending=True).head(1)
productname Newell 343 3.77 Name: sales, dtype: float64
# Q1 which state in the west is most profitable in product category?
weststates = df.loc[df['region'] == 'West']
weststates.pivot_table(index="customerstate", columns="productcategory", values="profit", \
aggfunc='sum', fill_value=0, margins=True)
productcategory | Furniture | Office Supplies | Technology | All |
---|---|---|---|---|
customerstate | ||||
Arizona | 3611.15 | 7282.9630 | 49567.281 | 60461.3940 |
California | 13792.46 | 29394.0675 | 21987.698 | 65174.2255 |
Colorado | 5253.71 | 21477.1575 | 46844.225 | 73575.0925 |
Idaho | 10535.04 | 34028.3840 | 34866.825 | 79430.2490 |
New Mexico | 6139.93 | 9170.8750 | 20075.526 | 35386.3310 |
Utah | 3054.18 | 29536.3950 | 34355.836 | 66946.4110 |
Washington | 15298.20 | 14779.6175 | 11816.630 | 41894.4475 |
All | 57684.67 | 145669.4595 | 219514.021 | 422868.1505 |
# Q2 what is the most profitable year in Idaho?
idaho = df.loc[df['customerstate'] == 'Idaho']
idahoyear=idaho.resample('Y')['profit'].sum().sort_values(ascending=False)
idahoyear.head()
orderdate 2011-12-31 35479.0070 2008-12-31 15273.7520 2009-12-31 11837.8540 2010-12-31 11579.3785 2012-12-31 5260.2575 Name: profit, dtype: float64
# Q3 What are the top 2 least profitable(or have losses) product names in Idaho?
idaho.groupby('productname')['profit'].sum().sort_values(ascending=True).head(2)
productname Polycom ViewStation™ ISDN Videoconferencing Unit -11797.31 Canon PC1060 Personal Laser Copier -2297.48 Name: profit, dtype: float64