Basic functions to preprocess tabular data before assembling it in a [`DataLoaders`](/data.core.html#DataLoaders) on the GPU.
from local.notebook.showdoc import *

class TabularGPU[source]

TabularGPU(df, procs=None, cat_names=None, cont_names=None, y_names=None, is_y_cat=True, splits=None, do_setup=True) :: Tabular

A DataFrame wrapper that knows which cols are cont/cat/y, and returns rows in __getitem__

TabularProcessors

setups[source]

setups(to:TabularGPU)

encodes[source]

encodes(to:TabularGPU)

# Fit Categorify on a single integer column held on the GPU and check the
# learned vocabulary and the encoded column.
df = cudf.from_pandas(pd.DataFrame({'a':[0,1,2,0,2]}))
to = TabularGPU(df, Categorify, 'a')
cat = to.procs.categorify
# The expected vocabulary starts with the '#na#' placeholder, so the observed
# categories are expected to be coded from 1 upwards.
test_eq(list(cat['a']), ['#na#','0','1','2'])
test_eq(to.a.to_array(), np.array([1,2,3,1,3]))
# Re-apply the already fitted processor to a new frame built with `to.new`.
df1 = cudf.from_pandas(pd.DataFrame({'a':[1,0,3,-1,2]}))
to1 = to.new(df1)
cat(to1)
#Values that weren't in the training df are sent to 0 (na)
test_eq(to1.a.to_array(), np.array([2,1,0,0,3]))
#Test decode
# Decoding is exercised on a TabularPandas built from the GPU items converted
# back to pandas; codes of 0 are expected to decode to '#na#'.
to2 = TabularPandas(to1.items.to_pandas(), None, 'a')
to2 = cat.decode(to2)
test_eq(to2.a, np.array(['1','0','#na#','#na#','2']))
# Same check with explicit splits: rows [0,1,2] form the first split and rows
# [3,4] the second.
df = cudf.from_pandas(pd.DataFrame({'a':[0,1,2,3,2]}))
to = TabularGPU(df, Categorify, 'a', splits=[[0,1,2], [3,4]])
cat = to.procs.categorify
# The value 3 only occurs in the second split; the expected vocabulary does not
# contain it and its expected code is 0 (na).
test_eq(list(cat['a']), ['#na#','0','1','2'])
test_eq(to.a.to_array(), np.array([1,2,3,0,3]))
#TODO Categorical (fails for now)
#df = cudf.from_pandas(pd.DataFrame({'a':pd.Categorical(['M','H','L','M'], categories=['H','M','L'], ordered=True)}))
#to = TabularGPU(df, Categorify, 'a')
#cat = to.procs.categorify
#test_eq(cat['a'].to_host(), ['H','M','L'])
#test_eq(df["a"].to_array(), [2,1,3,2])

setups[source]

setups(to:TabularGPU)

encodes[source]

encodes(to:TabularGPU)

# Fit Normalize on a single continuous column and compare against statistics
# computed with NumPy on the same values.
df = cudf.from_pandas(pd.DataFrame({'a':[0,1,2,3,4]}))
to = TabularGPU(df, Normalize, cont_names='a')
norm = to.procs.normalize
x = np.array([0,1,2,3,4])
# Reference mean and standard deviation (NumPy defaults).
m,s = x.mean(),x.std()
test_eq(norm.means['a'], m)
test_close(norm.stds['a'], s)
test_close(to.a.to_array(), (x-m)/s)
# Apply the fitted processor to new data: the statistics of the original
# frame (m, s) are expected to be reused, not recomputed.
df1 = cudf.from_pandas(pd.DataFrame({'a':[5,6,7]}))
to1 = to.new(df1)
norm(to1)
test_close(to1.a.to_array(), (np.array([5,6,7])-m)/s)

# Decoding is exercised on a TabularPandas built from the GPU items converted
# back to pandas; it is expected to restore the raw values.
to2 = TabularPandas(to1.items.to_pandas(), None, cont_names='a')
to2 = norm.decode(to2)
test_close(to2.a, [5,6,7])
# Same check with explicit splits: rows [0,1,2] form the first split and rows
# [3,4] the second.
df = cudf.from_pandas(pd.DataFrame({'a':[0,1,2,3,4]}))
to = TabularGPU(df, Normalize, cont_names='a', splits=[[0,1,2], [3,4]])
norm = to.procs.normalize

# Reference statistics come from the first split only ...
x = np.array([0,1,2])
m,s = x.mean(),x.std()
test_eq(norm.means, {'a': m})
test_close(norm.stds['a'], s)
# ... but the normalization is expected to be applied to every row.
test_close(to.a.to_array(), (np.array([0,1,2,3,4])-m)/s)

Series.median[source]

Series.median()

Get the median of self

# The expected medians are those of the non-NaN entries only:
# [0,1,1,2,3,4] -> 1.5 (even count) and [1,1,2,3,4] -> 2 (odd count).
col = cudf.Series([0,1,np.nan,1,2,3,4])
test_eq(col.median(), 1.5)
col = cudf.Series([np.nan,1,np.nan,1,2,3,4])
test_eq(col.median(), 2)

Series.idxmax[source]

Series.idxmax()

Return the index of the first occurrence of the max in self

setups[source]

setups(to:TabularGPU)

encodes[source]

encodes(to:TabularGPU)

# One FillMissing instance per fill strategy (median, constant, mode).
fill1,fill2,fill3 = (FillMissing(fill_strategy=s) 
                     for s in [FillStrategy.median, FillStrategy.constant, FillStrategy.mode])
df = cudf.from_pandas(pd.DataFrame({'a':[0,1,np.nan,1,2,3,4]}))
# Each TabularGPU gets its own copy of the frame.
df1 = df.copy(); df2 = df.copy()
tos = TabularGPU(df, fill1, cont_names='a'),TabularGPU(df1, fill2, cont_names='a'),TabularGPU(df2, fill3, cont_names='a')

# Expected fill value recorded for column 'a' under each strategy.
test_eq(fill1.na_dict, {'a': 1.5})
test_eq(fill2.na_dict, {'a': 0})
test_eq(fill3.na_dict, {'a': 1.0})

# A missing-value indicator column 'a_na' is expected to be registered as categorical.
for t in tos: test_eq(t.cat_names, ['a_na'])

# The NaN at position 2 is expected to be replaced by the strategy's value,
# with 'a_na' flagging that row.
for to_,v in zip(tos, [1.5, 0., 1.]):
    test_eq(to_.a.to_array(), np.array([0, 1, v, 1, 2, 3, 4]))
    test_eq(to_.a_na.to_array(), np.array([0, 0, 1, 0, 0, 0, 0]))
# Apply the fitted processors to new data: the fill values from the original
# frames are expected to be reused.
dfa = cudf.from_pandas(pd.DataFrame({'a':[np.nan,0,np.nan]}))
tos = [t.new(o) for t,o in zip(tos,(dfa,dfa.copy(),dfa.copy()))]
for t in tos: t.process()
for to_,v in zip(tos, [1.5, 0., 1.]):
    test_eq(to_.a.to_array(), np.array([v, 0, v]))
    test_eq(to_.a_na.to_array(), np.array([1, 0, 1]))
# Several processors combined on one categorical ('a') and one continuous ('b') column.
procs = [Normalize, Categorify, FillMissing, noop]
df = cudf.from_pandas(pd.DataFrame({'a':[0,1,2,1,1,2,0], 'b':[0,1,np.nan,1,2,3,4]}))
to = TabularGPU(df, procs, cat_names='a', cont_names='b')

#Test setup and apply on the training df
test_eq(to.a.to_array(), [1,2,3,2,2,3,1])
# 'b_na' is expected to be categorified too (False -> 1, True -> 2, per the
# classes checked below).
test_eq(to.b_na.to_array(), [1,1,2,1,1,1,1])
# The reference uses the median-filled column (NaN -> 1.5), i.e. the expected
# result corresponds to filling before normalizing even though Normalize is
# listed first in `procs`.
x = np.array([0,1,1.5,1,2,3,4])
m,s = x.mean(),x.std()
test_close(to.b.to_array(), (x-m)/s)
test_eq(to.procs.classes, {'a': ['#na#','0','1','2'], 'b_na': ['#na#','False','True']})
#Test apply on y_names
# Same data as the combined-processor check, plus a string target column 'c'.
procs = [Normalize, Categorify, FillMissing, noop]
df = cudf.from_pandas(pd.DataFrame({'a':[0,1,2,1,1,2,0], 'b':[0,1,np.nan,1,2,3,4], 'c': ['b','a','b','a','a','b','a']}))
to = TabularGPU(df, procs, cat_names='a', cont_names='b', y_names='c')

test_eq(to.cat_names, ['a', 'b_na'])
test_eq(to.a.to_array(), [1,2,3,2,2,3,1])
test_eq(to.b_na.to_array(), [1,1,2,1,1,1,1])
# The target is expected to be coded from 0 ('a' -> 0, 'b' -> 1): its classes
# checked below carry no '#na#' entry.
test_eq(to.c.to_array(), [1,0,1,0,0,1,0])
# Reference statistics use the median-filled column (NaN -> 1.5).
x = np.array([0,1,1.5,1,2,3,4])
m,s = x.mean(),x.std()
test_close(to.b.to_array(), (x-m)/s)
test_eq(to.procs.classes, {'a': ['#na#','0','1','2'], 'b_na': ['#na#','False','True'], 'c': ['a','b']})
# Same setup with a target column, additionally checking the dtype of the
# encoded categorical column.
procs = [Normalize, Categorify, FillMissing, noop]
df = cudf.from_pandas(pd.DataFrame({'a':[0,1,2,1,1,2,0], 'b':[0,1,np.nan,1,2,3,4], 'c': ['b','a','b','a','a','b','a']}))
to = TabularGPU(df, procs, cat_names='a', cont_names='b', y_names='c')

test_eq(to.cat_names, ['a', 'b_na'])
test_eq(to.a.to_array(), [1,2,3,2,2,3,1])
# Encoded categories are expected to be stored as integers.
test_eq(to.a.dtype,int)
test_eq(to.b_na.to_array(), [1,1,2,1,1,1,1])
test_eq(to.c.to_array(), [1,0,1,0,0,1,0])
# Combined processors with non-contiguous splits: rows [0,1,4,6] form the
# first split and rows [2,3,5] the second.
procs = [Normalize, Categorify, FillMissing, noop]
df = cudf.from_pandas(pd.DataFrame({'a':[0,1,2,1,1,2,0], 'b':[0,np.nan,1,1,2,3,4], 'c': ['b','a','b','a','a','b','a']}))
to = TabularGPU(df, procs, cat_names='a', cont_names='b', y_names='c', splits=[[0,1,4,6], [2,3,5]])

test_eq(to.cat_names, ['a', 'b_na'])
# NOTE(review): the expected arrays below match the rows taken in split order
# (0,1,4,6 then 2,3,5), not the original row order -- presumably TabularGPU
# reorders its items by split; confirm against the implementation.
# The value a=2 only occurs in the second split, hence its expected code of 0.
test_eq(to.a.to_array(), [1,2,2,1,0,2,0])
test_eq(to.a.dtype,int)
test_eq(to.b_na.to_array(), [1,2,1,1,1,1,1])
test_eq(to.c.to_array(), [1,0,0,0,1,0,1])

encodes[source]

encodes(to:TabularGPU)

Integration example

# Fetch the ADULT_SAMPLE dataset and load its CSV into a cuDF frame via pandas.
path = untar_data(URLs.ADULT_SAMPLE)
df = cudf.from_pandas(pd.read_csv(path/'adult.csv'))
# First 10000 rows for training, the remainder for testing (independent copies).
df_trn,df_tst = df.iloc[:10000].copy(),df.iloc[10000:].copy()
df_trn.head()
<cudf.DataFrame ncols=15 nrows=5 >
# Column roles and processors for the integration example.
cat_names = ['workclass', 'education', 'marital-status', 'occupation', 'relationship', 'race']
cont_names = ['age', 'fnlwgt', 'education-num']
procs = [Categorify, FillMissing, Normalize]

# Random split over the row indices of the training frame.
splits = RandomSplitter()(range_of(df_trn))
# `%time` is an IPython magic: this line only runs inside a notebook/IPython session.
%time to = TabularGPU(df_trn, procs, splits=splits, cat_names=cat_names, cont_names=cont_names, y_names="salary")
CPU times: user 755 ms, sys: 36.7 ms, total: 792 ms
Wall time: 798 ms
# Rebuild the splits as two contiguous index ranges of the same sizes as the
# random ones.
# NOTE(review): presumably needed because TabularGPU stores its rows in split
# order -- confirm against the implementation.
splits = [list(range(len(splits[0]))), list(range(len(splits[0]), 10000))]
dsrc = Datasets(to, splits=splits, tfms=[None])
# Batch the validation subset in the main process (num_workers=0) and display one batch.
dl = TabDataLoader(to.valid, bs=64, num_workers=0)
dl.show_batch()
/home/sgugger/anaconda3/lib/python3.7/site-packages/cudf/io/dlpack.py:83: UserWarning: WARNING: cuDF to_dlpack() produces column-major (Fortran order) output. If the output tensor needs to be row major, transpose the output of this function.
  return cpp_dlpack.to_dlpack(gdf_cols)
age fnlwgt education-num workclass education marital-status occupation relationship race education-num_na salary
0 44.0 282721.999450 15.0 Self-emp-not-inc Prof-school Married-civ-spouse Protective-serv Husband White False >=50k
1 18.0 116528.002955 10.0 Private Some-college Never-married Exec-managerial Not-in-family White False <50k
2 52.0 253783.997089 7.0 Private 11th Divorced Priv-house-serv Unmarried White False <50k
3 39.0 175231.999649 10.0 Federal-gov Some-college Married-civ-spouse Machine-op-inspct Husband White True >=50k
4 44.0 36271.003439 13.0 Private Bachelors Married-civ-spouse Exec-managerial Husband White False <50k
5 53.0 196277.999985 10.0 Private Some-college Widowed Tech-support Not-in-family White False <50k
6 29.0 150860.998472 10.0 Private Some-college Never-married Armed-Forces Not-in-family White False <50k
7 39.0 139647.001399 10.0 Private Some-college Divorced Farming-fishing Unmarried White False <50k
8 49.0 481986.987541 9.0 ? HS-grad Married-civ-spouse Adm-clerical Husband White False <50k
9 48.0 205423.999545 13.0 Private Bachelors Divorced Tech-support Unmarried White False >=50k