Dask 按多列进行 groupby 时出现问题

时间:2018-04-02 08:01:57

标签: python dataframe dask dask-distributed

我使用dataframe.from_delayed方法创建了以下数据框,其中包含以下列

_id     hour_timestamp  http_method     total_hits  username    hour    weekday

源数据框的一些细节:

hits_rate_stats._meta.dtypes
_id                       object
hour_timestamp    datetime64[ns]
http_method               object
total_hits                object
username                  object
hour                       int64
weekday                    int64
dtype: object
meta index:

RangeIndex(start=0, stop=0, step=1)

当我执行以下代码时

my_df_grouped = my_df.groupby(['username', 'http_method', 'weekday', 'hour'])
my_df_grouped.total_hits.sum().reset_index().compute()

我得到以下异常:

---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-27-b24b24fc86db> in <module>()
----> 1 hits_rate_stats_grouped.total_hits.sum().reset_index().compute()

/home/avlach/virtualenvs/enorasys_sa_v2/local/lib/python2.7/site-packages/dask/base.pyc in compute(self, **kwargs)
    141         dask.base.compute
    142         """
--> 143         (result,) = compute(self, traverse=False, **kwargs)
    144         return result
    145 

/home/avlach/virtualenvs/enorasys_sa_v2/local/lib/python2.7/site-packages/dask/base.pyc in compute(*args, **kwargs)
    390     postcomputes = [a.__dask_postcompute__() if is_dask_collection(a)
    391                     else (None, a) for a in args]
--> 392     results = get(dsk, keys, **kwargs)
    393     results_iter = iter(results)
    394     return tuple(a if f is None else f(next(results_iter), *a)

/home/avlach/virtualenvs/enorasys_sa_v2/local/lib/python2.7/site-packages/distributed/client.pyc in get(self, dsk, keys, restrictions, loose_restrictions, resources, sync, asynchronous, **kwargs)
   2039                 secede()
   2040             try:
-> 2041                 results = self.gather(packed, asynchronous=asynchronous)
   2042             finally:
   2043                 for f in futures.values():

/home/avlach/virtualenvs/enorasys_sa_v2/local/lib/python2.7/site-packages/distributed/client.pyc in gather(self, futures, errors, maxsize, direct, asynchronous)
   1476             return self.sync(self._gather, futures, errors=errors,
   1477                              direct=direct, local_worker=local_worker,
-> 1478                              asynchronous=asynchronous)
   1479 
   1480     @gen.coroutine

/home/avlach/virtualenvs/enorasys_sa_v2/local/lib/python2.7/site-packages/distributed/client.pyc in sync(self, func, *args, **kwargs)
    601             return future
    602         else:
--> 603             return sync(self.loop, func, *args, **kwargs)
    604 
    605     def __repr__(self):

/home/avlach/virtualenvs/enorasys_sa_v2/local/lib/python2.7/site-packages/distributed/utils.pyc in sync(loop, func, *args, **kwargs)
    251             e.wait(10)
    252     if error[0]:
--> 253         six.reraise(*error[0])
    254     else:
    255         return result[0]

/home/avlach/virtualenvs/enorasys_sa_v2/local/lib/python2.7/site-packages/distributed/utils.pyc in f()
    235             yield gen.moment
    236             thread_state.asynchronous = True
--> 237             result[0] = yield make_coro()
    238         except Exception as exc:
    239             logger.exception(exc)

/home/avlach/virtualenvs/enorasys_sa_v2/local/lib/python2.7/site-packages/tornado/gen.pyc in run(self)
   1053 
   1054                     try:
-> 1055                         value = future.result()
   1056                     except Exception:
   1057                         self.had_exception = True

/home/avlach/virtualenvs/enorasys_sa_v2/local/lib/python2.7/site-packages/tornado/concurrent.pyc in result(self, timeout)
    236         if self._exc_info is not None:
    237             try:
--> 238                 raise_exc_info(self._exc_info)
    239             finally:
    240                 self = None

/home/avlach/virtualenvs/enorasys_sa_v2/local/lib/python2.7/site-packages/tornado/gen.pyc in run(self)
   1061                     if exc_info is not None:
   1062                         try:
-> 1063                             yielded = self.gen.throw(*exc_info)
   1064                         finally:
   1065                             # Break up a reference to itself

/home/avlach/virtualenvs/enorasys_sa_v2/local/lib/python2.7/site-packages/distributed/client.pyc in _gather(self, futures, errors, direct, local_worker)
   1354                             six.reraise(type(exception),
   1355                                         exception,
-> 1356                                         traceback)
   1357                     if errors == 'skip':
   1358                         bad_keys.add(key)

/home/avlach/virtualenvs/enorasys_sa_v2/local/lib/python2.7/site-packages/dask/dataframe/core.pyc in apply_and_enforce()
   3354             return meta
   3355         c = meta.columns if isinstance(df, pd.DataFrame) else meta.name
-> 3356         return _rename(c, df)
   3357     return df
   3358 

/home/avlach/virtualenvs/enorasys_sa_v2/local/lib/python2.7/site-packages/dask/dataframe/core.pyc in _rename()
   3391         # deep=False doesn't doesn't copy any data/indices, so this is cheap
   3392         df = df.copy(deep=False)
-> 3393         df.columns = columns
   3394         return df
   3395     elif isinstance(df, (pd.Series, pd.Index)):

/home/avlach/virtualenvs/enorasys_sa_v2/local/lib/python2.7/site-packages/pandas/core/generic.pyc in __setattr__()
   3625         try:
   3626             object.__getattribute__(self, name)
-> 3627             return object.__setattr__(self, name, value)
   3628         except AttributeError:
   3629             pass

pandas/_libs/properties.pyx in pandas._libs.properties.AxisProperty.__set__()

/home/avlach/virtualenvs/enorasys_sa_v2/local/lib/python2.7/site-packages/pandas/core/generic.pyc in _set_axis()
    557 
    558     def _set_axis(self, axis, labels):
--> 559         self._data.set_axis(axis, labels)
    560         self._clear_item_cache()
    561 

/home/avlach/virtualenvs/enorasys_sa_v2/local/lib/python2.7/site-packages/pandas/core/internals.pyc in set_axis()
   3072             raise ValueError('Length mismatch: Expected axis has %d elements, '
   3073                              'new values have %d elements' %
-> 3074                              (old_len, new_len))
   3075 
   3076         self.axes[axis] = new_labels

ValueError: Length mismatch: Expected axis has 5 elements, new values have 2 elements

当我执行 my_df_grouped.count().reset_index().compute() 时,它可以正常工作;但当我执行 my_df_grouped.sum().reset_index().compute() 时,会得到以下错误:

/home/avlach/virtualenvs/enorasys_sa_v2/local/lib/python2.7/site-packages/pandas/core/groupby.pyc in _get_grouper()
   2830                     raise ValueError('No group keys passed!')
   2831                 else:
-> 2832                     raise ValueError('multiple levels only valid with '
   2833                                      'MultiIndex')
   2834 

ValueError: multiple levels only valid with MultiIndex

使用虚拟数据在本地尝试复现时,并不会出现这些错误。可能是哪里出了问题?

编辑: 看起来它丢失了 MultiIndex(多级索引)。如果我这样做:

total_hits = my_df_grouped.total_hits.sum()
total_hits._meta.index = pd.MultiIndex(levels=[[],[],[],[],], labels=[[],[],[],[]], names=['username', 'http_method', 'weekday', 'hour'])

0 个答案:

没有答案