我先得到这些错误:
TypeError: 'encoding' is an invalid keyword argument for this function
然后我在 open 前面加上了 io.(即改用 io.open),但又收到了这个错误:
UnicodeDecodeError: 'utf8' codec can't decode byte 0xfb in position 10: invalid start byte
class MySentences(object):
    """Stream tokenized lines from a two-level directory of text files.

    Every entry of ``dirname`` is listed as a sub-directory, and every entry
    of each sub-directory is opened as a text file. Iterating the object
    yields one token list per line, so the corpus is never loaded into
    memory as a whole.

    Args:
        dirname: Root directory; each entry in it is treated as a
            sub-directory of text files.
        encoding: Codec used to decode the files (default ``'utf8'``).
            A ``UnicodeDecodeError`` during iteration means the files are
            not stored in this codec; pass the real one (for example
            ``'latin-1'``).
        errors: Codec error handler passed to ``io.open``. The default
            ``'strict'`` raises on undecodable bytes; ``'replace'`` or
            ``'ignore'`` tolerate them.
    """

    def __init__(self, dirname, encoding='utf8', errors='strict'):
        self.dirname = dirname
        self.encoding = encoding
        self.errors = errors

    def __iter__(self):
        """Yield the token list of every line in every file, file by file.

        Raises:
            UnicodeDecodeError: if a file contains bytes that are invalid in
                ``self.encoding`` and ``self.errors`` is ``'strict'``.
            OSError: if an entry of ``dirname`` cannot be listed as a
                directory or a file cannot be opened.
        """
        # Local import: the built-in open() on Python 2 rejects the
        # ``encoding`` keyword, while io.open accepts it on Python 2 and 3.
        import io

        for sub_name in os.listdir(self.dirname):
            sub_dir = os.path.join(self.dirname, sub_name)
            for fname in os.listdir(sub_dir):
                text_file = os.path.join(sub_dir, fname)
                # The context manager closes the handle once the file is
                # exhausted or the generator is closed or discarded.
                with io.open(text_file, encoding=self.encoding,
                             errors=self.errors) as handle:
                    for line in handle:
                        yield nltk.word_tokenize(line, language='english')