DataLoader code example

Example: PyTorch data

# Columns that are converted to categorical codes further down.
cat_cols = [
    "Hour",
    "AMorPM",
    "Weekday",
]

# Columns that are used as continuous (float) features.
cont_cols = [
    "pickup_longitude",
    "pickup_latitude",
    "dropoff_longitude",
    "dropoff_latitude",
    "passenger_count",
    "dist_km",
]

# Target column; kept as a one-element list so that df[y_col] is 2-D.
y_col = ["fare_amount"]

# Cast each categorical column to the pandas "category" dtype so that
# integer codes are available through the `.cat` accessor later on.
for cat in cat_cols:
    df[cat] = df[cat].astype("category")

# Shuffle with a fixed random_state, then discard the shuffled index in
# favour of a fresh default one.
# NOTE(review): `shuffle` is defined outside this snippet (presumably
# sklearn.utils.shuffle) -- confirm it returns a DataFrame.
df = shuffle(df, random_state=101).reset_index(drop=True)

# Categorical features: one column of integer category codes per entry in
# cat_cols, stacked along axis 1 and stored as int64.
cats = torch.tensor(
    np.stack([df[name].cat.codes.values for name in cat_cols], axis=1),
    dtype=torch.int64,
)

# Continuous features: one column per entry in cont_cols, stored as float.
conts = torch.tensor(
    np.stack([df[name].values for name in cont_cols], axis=1),
    dtype=torch.float,
)

# Regression target, reshaped to a single column.
y = torch.tensor(df[y_col].values, dtype=torch.float).reshape(-1, 1)
# For classification, flatten to 1-D instead:
#   y = torch.tensor(df[y_col].values, dtype=torch.float).flatten()
# NOTE(review): some losses (e.g. nn.CrossEntropyLoss with class indices)
# expect an integer target dtype -- confirm against the loss in use.

# Number of distinct categories in each categorical column.
cat_sizes = [len(df[name].cat.categories) for name in cat_cols]

# (category count, embedding width) per column; the width is half the
# category count rounded up, capped at 50.
emb_sizes = [(n_cats, min(50, (n_cats + 1) // 2)) for n_cats in cat_sizes]

# b: suggested batch size -- in the slices below it is the number of leading
#    rows that are used in total.
# t: suggested test size -- the last t of those b rows are held out.
b = 30000
t = 5000

# Rows [0, b - t) form the training set and rows [b - t, b) the test set;
# the same split is applied to categorical, continuous and target tensors.
cat_train, cat_test = cats[:b - t], cats[b - t:b]
con_train, con_test = conts[:b - t], conts[b - t:b]
y_train, y_test = y[:b - t], y[b - t:b]