In [13]:
#imports
import pandas as pd
import numpy as np

#read CSVs - CSVs were already cleaned via regex using Notepad++
customerdemand = pd.read_csv('Customer_Demand.csv', index_col=False)
customerdemand = customerdemand.drop_duplicates()

weightvolume = pd.read_csv('Sku_weight_volume.csv', index_col=False)
weightvolume = weightvolume.drop_duplicates()

#rename fields so they use consistent cases for ease of us later
weightvolume = weightvolume.rename(index=str, columns={'Weight': 'weight', 'Cubic': 'cubic'})

#check
customerdemand.shape + weightvolume.shape
Out[13]:
(1264461, 4, 55799, 4)
In [15]:
#combine the two files
combine = weightvolume.merge(customerdemand, left_on='product', right_on='product', how='inner')
In [16]:
#don't think we need these anymore since I changed to inner join from outer join
filtered = combine[(combine['customer'] != 0) & (combine['customer'].notnull())
                    & (combine['weight'] >= 0) & (combine['weight'].notnull())
                    & (combine['cubic'] >= 0) & (combine['cubic'].notnull())
                    & (combine['price'] >= 0) & (combine['price'].notnull())]

#if these are the same size then we don't need the above filters
filtered.shape + combine.shape
Out[16]:
(1244188, 7, 1244188, 7)
In [17]:
#so we didn't need the above filters!
In [18]:
#we estimate revenue using piece price times quantity
filtered = filtered.assign(revenue = filtered['price']*filtered['quantity'])

#then sum up all revenue
filtered['revenue'].sum()
Out[18]:
118725118