我在Python中关注自然语言处理的tutorial。我正在以一种我觉得更容易阅读的方式构建代码。
class Datasets(object):
labeled_training_data = pandas.read_csv(r"G:\Downloads\Kaggle\NLP\labeledTrainData.tsv\labeledTrainData.tsv",
header=0,
delimiter="\t",
quoting=3)
test_data = pandas.read_csv(r"G:\Downloads\Kaggle\NLP\testData.tsv\testData.tsv",
header=0,
delimiter="\t",
quoting=3)
unlabeled_training_data = pandas.read_csv(r"G:\Downloads\Kaggle\NLP\unlabeledTrainData.tsv\unlabeledTrainData.tsv",
header=0,
delimiter="\t",
quoting=3)
class TextCleaner(object):
@classmethod
def convert_to_word_list(html_text, remove_stopwords = False):
cleaned_text = BeautifulSoup(html_text).get_text()
cleaned_text = re.sub("[^a-zA-Z]", " ", cleaned_text)
all_words = cleaned_text.lower().split()
if remove_stopwords:
stop_words = set(stopwords.words("english"))
remaining_words = [word for word in all_words if not word in stop_words]
return(all_words)
@classmethod
def __splitIntoSentences(text, tokenizer, remove_stopwords = False):
sentences_in_text = tokenizer.tokenize(text.strip())
non_empty_sentences = []
for sentence in sentences_in_text:
if len(sentence) > 0:
non_empty_sentences.append(__convertToWordList(sentence, remove_stopwords))
return non_empty_sentences
@classmethod
def get_all_sentences_from_training_sets(tokenizer):
all_sentences = []
print "Parsing sentences from the labeled training set..."
for review in Datasets.labeled_training_data["review"]:
all_sentences.append(TextCleaner.__splitIntoSentences(review, tokenizer))
print "Parsing sentences from the unlabeled training set..."
for review in Datasets.unlabeled_training_data["review"]:
all_sentences.append(TextCleaner.__splitIntoSentences(review, tokenizer))
print "There are %d sentences in our training sets." % len(all_sentences)
return all_sentences
当我在REPL中运行这两行时出现问题:
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
all_sentences = TextCleaner.get_all_sentences_from_training_sets(tokenizer)
我收到以下错误消息:
TypeError:getAllSentencesFromTrainingData()只取一个参数 (2给出)
我已阅读documentation中的nltk.data.load()
,但我仍然不知道导致错误消息的原因。
帮助?
答案 0 :(得分:2)
classmethod
的第一个参数应该是cls
,它将被填充(编辑:由Python运行时,而不是你)和调用该方法的特定类。