Useful links

Slides

Snippet for Wegmans Data

# Define functions to parse txt files containing itmes, stores, customers, and transactions
def parseItem(s):
    l=s.split('|')
    return Row(item_number=int(l[0]), 
               dept_categ_class=l[1],               
               item_des=l[2],
               item_unt_qty=float(l[3]), 
               size_unit_desc=l[4], 
               brand_code=l[5], 
               dept_num=int(l[6]), 
               dept_name=l[7], 
               categ_num=int(l[8]), 
               categ_name=l[9], 
               class_num=int(l[10]),
               class_name=l[11])
def parseCustomer(s):
    l=s.split('|')
    return Row(hshld_acct=int(l[0]),
               birth_yr_head_hh=l[1],
               hh_income=l[2],
               hh_size=l[3],
               adult_count=l[4],
               child_count=l[5],
               birth_yr_oldest=l[6],
               birth_yr_youngest=l[7],
               bad_address=l[8],
               privacy=l[9],
               application_date=datetime.strptime(l[10],'%Y-%m-%d'),
               wine_email_sent=int(l[11]),
               wine_email_open=int(l[12]),
               wine_email_click=int(l[13]))
def parsePostrans(s):
    l=s.split('|')
    return Row(hshld_acct=int(l[0]),
               acct_num=int(l[1]),
               trans_num=int(l[2]),
               trans_date=datetime.strptime(l[3],'%Y-%m-%d'),
               store_num=int(l[4]),
               item_number=int(l[5]),
               dept_categ_class=l[6],
               unit_count=int(l[7]),
               net_sales=float(l[8]),
               gross_sales=float(l[9]),
               manuf_coupon=float(l[10]))