Dask gets "FileNotFoundError: [Errno 2] No such file or directory" partway through a file

Asked: 2019-05-06 01:39:24

Tags: python dask

I'm building a bag from a plain txt file - it holds a large number of reviews, delimited by two newlines. Sometimes, though - and I really can't predict when - it fails partway through processing with FileNotFoundError: [Errno 2] No such file or directory: '/mnt/c/Workspaces/Books/Dask/foods.txt'
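For context, the get_item function below parses each review as a block of 'key: value' lines, so the file is assumed to look roughly like this (the field names here are made up for illustration - only the 'key: value' lines and the blank line between reviews matter):

product/id: B0001
review/score: 5.0
review/text: Tasty and fresh.

product/id: B0002
review/score: 2.0
review/text: Arrived stale.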

Here is the actual code:

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
import dask.dataframe as dd
from dask.diagnostics import ProgressBar
import numpy as np
import dask.bag as bag
import os

def get_next_part(file, start_index, span_index=0, blocksize=1000):
    file.seek(start_index)
    buffer = file.read(blocksize + span_index).decode('cp1252')
    delimiter_position = buffer.find('\n\n')
    if delimiter_position == -1:
        return get_next_part(file, start_index, span_index + blocksize)
    else:
        file.seek(start_index)
        return start_index, delimiter_position

def get_item(filename, start_index, delimiter_position, encoding='cp1252'):
    with open(filename, 'rb') as file_handle:
        file_handle.seek(start_index)
        text = file_handle.read(delimiter_position).decode(encoding)
        return dict((element.split(': ')[0], element.split(': ')[1])
                               if len(element.split(': ')) > 1
                               else ('unknown', element)
                               for element in text.strip().split('\n'))    


with open(f"{os.getcwd()}/foods.txt", 'rb') as file_handle:
    size = file_handle.seek(0,2) - 1
    more_data = True
    output = []
    current_position = next_position = 0
    while more_data:
        if current_position >= size:
            more_data = False
        else:
            current_position, next_position = get_next_part(file_handle, current_position, 0)
            output.append((current_position, next_position))
            current_position = current_position + next_position + 2

with ProgressBar():
    reviews = (bag.from_sequence(output, npartitions=104)
               .map(lambda x: get_item(f"{os.getcwd()}/foods.txt", 
                                       x[0], 
                                       x[1]))
              .compute())

Sometimes it works fine, but other times it gives me this (at a different completion percentage each time):

[##########                              ] | 26% Completed | 54.3s
---------------------------------------------------------------------------
FileNotFoundError                         Traceback (most recent call last)
<ipython-input-1-90a316620d10> in <module>()
     42 with ProgressBar():
     43     reviews = (bag.from_sequence(output, npartitions=104)
---> 44                .map(lambda x: get_item(f"{os.getcwd()}/foods.txt", 
     45                                        x[0],
     46                                        x[1]))

~/anaconda3/envs/py36/lib/python3.6/site-packages/dask/base.py in compute(self, **kwargs)
    154         dask.base.compute
    155         """
--> 156         (result,) = compute(self, traverse=False, **kwargs)
    157         return result
    158 

~/anaconda3/envs/py36/lib/python3.6/site-packages/dask/base.py in compute(*args, **kwargs)
    396     keys = [x.__dask_keys__() for x in collections]
    397     postcomputes = [x.__dask_postcompute__() for x in collections]
--> 398     results = schedule(dsk, keys, **kwargs)
    399     return repack([f(r, *a) for r, (f, a) in zip(results, postcomputes)])
    400 

~/anaconda3/envs/py36/lib/python3.6/site-packages/dask/multiprocessing.py in get(dsk, keys, num_workers, func_loads, func_dumps, optimize_graph, pool, **kwargs)
    190                            get_id=_process_get_id, dumps=dumps, loads=loads,
    191                            pack_exception=pack_exception,
--> 192                            raise_exception=reraise, **kwargs)
    193     finally:
    194         if cleanup:

~/anaconda3/envs/py36/lib/python3.6/site-packages/dask/local.py in get_async(apply_async, num_workers, dsk, result, cache, get_id, rerun_exceptions_locally, pack_exception, raise_exception, callbacks, dumps, loads, **kwargs)
    460                         _execute_task(task, data)  # Re-execute locally
    461                     else:
--> 462                         raise_exception(exc, tb)
    463                 res, worker_id = loads(res_info)
    464                 state['cache'][key] = res

~/anaconda3/envs/py36/lib/python3.6/site-packages/dask/compatibility.py in reraise(exc, tb)
    109     def reraise(exc, tb=None):
    110         if exc.__traceback__ is not tb:
--> 111             raise exc.with_traceback(tb)
    112         raise exc
    113 

~/anaconda3/envs/py36/lib/python3.6/site-packages/dask/local.py in execute_task()
    228     try:
    229         task, data = loads(task_info)
--> 230         result = _execute_task(task, data)
    231         id = get_id()
    232         result = dumps((result, id))

~/anaconda3/envs/py36/lib/python3.6/site-packages/dask/core.py in _execute_task()
    117         func, args = arg[0], arg[1:]
    118         args2 = [_execute_task(a, cache) for a in args]
--> 119         return func(*args2)
    120     elif not ishashable(arg):
    121         return arg

~/anaconda3/envs/py36/lib/python3.6/site-packages/dask/bag/core.py in reify()
   1589 def reify(seq):
   1590     if isinstance(seq, Iterator):
-> 1591         seq = list(seq)
   1592     if seq and isinstance(seq[0], Iterator):
   1593         seq = list(map(list, seq))

~/anaconda3/envs/py36/lib/python3.6/site-packages/dask/bag/core.py in map_chunk()
   1749     else:
   1750         for a in zip(*args):
-> 1751             yield f(*a)
   1752 
   1753     # Check that all iterators are fully exhausted

<ipython-input-1-90a316620d10> in <lambda>()
     44                .map(lambda x: get_item(f"{os.getcwd()}/foods.txt", 
     45                                        x[0],
---> 46                                        x[1]))
     47               .compute())

<ipython-input-1-90a316620d10> in get_item()
     18 
     19 def get_item(filename, start_index, delimiter_position, encoding='cp1252'):
---> 20     with open(filename, 'rb') as file_handle:
     21         file_handle.seek(start_index)
     22         text = file_handle.read(delimiter_position).decode(encoding)

FileNotFoundError: [Errno 2] No such file or directory: '/mnt/c/Workspaces/Books/Dask/foods.txt'

I've tried playing with the number of partitions - leaving it at the default (101), or making sure it's a multiple of 4. That doesn't seem to have any effect.

Does anyone know what's going on here? It usually works if I run it a second time, but it's still a pain to deal with.

I'm on the latest version of Dask, installed with conda; everything runs in JupyterLab, and I'm launching it from the Windows Subsystem for Linux.

Thanks!

1 Answer:

Answer 0: (score: 0)

I wasn't able to fix my original read method, but I did find a different way to do the parallel read (using native Dask objects as well!).

The sections are delimited by \n\n. The linedelimiter argument of bag doesn't mean what I thought it did, but with it I was able to work out a way to get the sections I need: Why `linedelimiter` does not work for bag.read_text?
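The answer doesn't show the code it ended up with. As a rough sketch of one way to do a delimiter-aware parallel read with native Dask objects (not necessarily what the answerer settled on - parse_block and the 10 MB blocksize here are illustrative choices), dask.bytes.read_bytes can cut the file into blocks that end exactly on a b'\n\n' boundary, and a bag can then be built from the resulting delayed values:

import os
import dask
import dask.bag as bag
from dask.bytes import read_bytes

def parse_block(data, encoding='cp1252'):
    # Decode one byte block and turn every blank-line-delimited section
    # into a dict, using the same 'key: value' logic as get_item above.
    records = []
    for section in data.decode(encoding).split('\n\n'):
        section = section.strip()
        if not section:
            continue
        records.append(dict((element.split(': ')[0], element.split(': ')[1])
                            if len(element.split(': ')) > 1
                            else ('unknown', element)
                            for element in section.split('\n')))
    return records

# read_bytes splits the file into ~10 MB blocks, each extended so that it
# ends on a b'\n\n' boundary - no review straddles two partitions.
_, blocks = read_bytes(f"{os.getcwd()}/foods.txt", delimiter=b'\n\n', blocksize=10_000_000)

# blocks is a list (one entry per input file) of lists of delayed byte strings
partitions = [dask.delayed(parse_block)(blk)
              for file_blocks in blocks
              for blk in file_blocks]

reviews = bag.from_delayed(partitions)
reviews.take(2)  # peek at a couple of parsed reviews

Because each block already ends on the review delimiter, every partition can be parsed on its own without reopening the file once per review, which is also much lighter on the filesystem than the original get_item approach.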