Random Forest with GridSearchCV - error on param_grid

Asked: 2016-01-19 23:49:46

Tags: python scikit-learn random-forest grid-search

I am trying to build a Random Forest model with GridSearchCV, but I get an error related to param_grid: "ValueError: Invalid parameter max_features for estimator Pipeline. Check the list of available parameters with `estimator.get_params().keys()`." I am classifying documents, so I also push the tf-idf vectorizer into the pipeline. Here is the code:

from sklearn import metrics
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.grid_search import GridSearchCV  # sklearn.model_selection.GridSearchCV from 0.18 on
from sklearn.metrics import classification_report, f1_score, accuracy_score, precision_score, confusion_matrix
from sklearn.pipeline import Pipeline

# Classifier Pipeline
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('classifier', RandomForestClassifier())
])
# Params for classifier
params = {"max_depth": [3, None],
              "max_features": [1, 3, 10],
              "min_samples_split": [1, 3, 10],
              "min_samples_leaf": [1, 3, 10],
              # "bootstrap": [True, False],
              "criterion": ["gini", "entropy"]}

# Grid Search Execute
rf_grid = GridSearchCV(estimator=pipeline, param_grid=params)  # cv=10
rf_detector = rf_grid.fit(X_train, Y_train)
print(rf_grid.grid_scores_)

I can't figure out why this error appears. The same thing happens, by the way, when I run a decision tree with GridSearchCV. (Scikit-learn 0.17)

3 Answers:

Answer 0 (score: 17)

You have to assign the parameters to the named step in your pipeline, in your case classifier. Try prepending classifier__ to the parameter names. Sample pipeline

params = {"classifier__max_depth": [3, None],
              "classifier__max_features": [1, 3, 10],
              "classifier__min_samples_split": [1, 3, 10],
              "classifier__min_samples_leaf": [1, 3, 10],
              # "bootstrap": [True, False],
              "classifier__criterion": ["gini", "entropy"]}

Answer 1 (score: 5)

Try running get_params() on the final pipeline object, not just on the estimator. That way it lists all of the available, uniquely-prefixed keys of the pipeline items that can be used as grid parameters.

sorted(pipeline.get_params().keys())

['classifier', 'classifier__bootstrap', 'classifier__class_weight', 'classifier__criterion', 'classifier__max_depth', 'classifier__max_features', 'classifier__max_leaf_nodes', 'classifier__min_impurity_split', 'classifier__min_samples_leaf', 'classifier__min_samples_split', 'classifier__min_weight_fraction_leaf', 'classifier__n_estimators', 'classifier__n_jobs', 'classifier__oob_score', 'classifier__random_state', 'classifier__verbose', 'classifier__warm_start', 'steps', 'tfidf', 'tfidf__analyzer', 'tfidf__binary', 'tfidf__decode_error', 'tfidf__dtype', 'tfidf__encoding', 'tfidf__input', 'tfidf__lowercase', 'tfidf__max_df', 'tfidf__max_features', 'tfidf__min_df', 'tfidf__ngram_range', 'tfidf__norm', 'tfidf__preprocessor', 'tfidf__smooth_idf', 'tfidf__stop_words', 'tfidf__strip_accents', 'tfidf__sublinear_tf', 'tfidf__token_pattern', 'tfidf__tokenizer', 'tfidf__use_idf', 'tfidf__vocabulary']
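
Any of those keys can be used directly in the grid, which also makes it easy to tune the vectorizer and the classifier together; a small illustrative sketch (the values are arbitrary examples, not a recommendation):

# Mix vectorizer and classifier parameters using the prefixed keys above
params = {"tfidf__ngram_range": [(1, 1), (1, 2)],
          "classifier__n_estimators": [100, 300],
          "classifier__max_depth": [3, None]}

rf_grid = GridSearchCV(estimator=pipeline, param_grid=params)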

This is especially useful when you use the shorter make_pipeline() syntax for Pipelines, where you don't bother naming the pipeline steps:

from sklearn.pipeline import make_pipeline

pipeline = make_pipeline(TfidfVectorizer(), RandomForestClassifier())
sorted(pipeline.get_params().keys())

['randomforestclassifier', 'randomforestclassifier__bootstrap', 'randomforestclassifier__class_weight', 'randomforestclassifier__criterion', 'randomforestclassifier__max_depth', 'randomforestclassifier__max_features', 'randomforestclassifier__max_leaf_nodes', 'randomforestclassifier__min_impurity_split', 'randomforestclassifier__min_samples_leaf', 'randomforestclassifier__min_samples_split', 'randomforestclassifier__min_weight_fraction_leaf', 'randomforestclassifier__n_estimators', 'randomforestclassifier__n_jobs', 'randomforestclassifier__oob_score', 'randomforestclassifier__random_state', 'randomforestclassifier__verbose', 'randomforestclassifier__warm_start', 'steps', 'tfidfvectorizer', 'tfidfvectorizer__analyzer', 'tfidfvectorizer__binary', 'tfidfvectorizer__decode_error', 'tfidfvectorizer__dtype', 'tfidfvectorizer__encoding', 'tfidfvectorizer__input', 'tfidfvectorizer__lowercase', 'tfidfvectorizer__max_df', 'tfidfvectorizer__max_features', 'tfidfvectorizer__min_df', 'tfidfvectorizer__ngram_range', 'tfidfvectorizer__norm', 'tfidfvectorizer__preprocessor', 'tfidfvectorizer__smooth_idf', 'tfidfvectorizer__stop_words', 'tfidfvectorizer__strip_accents', 'tfidfvectorizer__sublinear_tf', 'tfidfvectorizer__token_pattern', 'tfidfvectorizer__tokenizer', 'tfidfvectorizer__use_idf', 'tfidfvectorizer__vocabulary']
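
The same prefixing rule works with these auto-generated step names (the lowercased class names); a hedged sketch with arbitrary example values:

# Step prefixes follow the lowercased class names generated by make_pipeline
params = {"randomforestclassifier__max_depth": [3, None],
          "randomforestclassifier__criterion": ["gini", "entropy"],
          "tfidfvectorizer__use_idf": [True, False]}

rf_grid = GridSearchCV(estimator=pipeline, param_grid=params)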

Answer 2 (score: 0)

ValueError: Invalid parameter min_sample_leaf for estimator RandomForestClassifier(n_estimators=1200, n_jobs=1). Check the list of available parameters with `estimator.get_params().keys()`.

I was getting the error above even though I applied the first approach; here is the code that produced it:

import numpy as np

from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

grid = {'classifier__n_estimators': [10,100,200,500,1000,1200],
       'classifier__max_depth': [None, 5, 10, 20, 30],
       'classifier__max_features': ['auto','sqrt'],
       'classifier__min_sample_split': [2,4,6],
       'classifier__min_sample_leaf': [1,2,4]}

np.random.seed(42)

#split into X and y 
X = heart_disease_shuf.drop('target', axis=1)
y = heart_disease_shuf['target']

# split into train and test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# Instantiate RandomForestClassifier
clf = RandomForestClassifier(n_jobs=1)

# Setup RandomizedSearchCV
rs_clf = RandomizedSearchCV(estimator=clf, 
                            param_distributions=grid,
                            n_iter=10, # number of models to try
                            cv=5,
                            verbose=2)

# Fit the RandomizedSearchCV version of clf
rs_clf.fit(X_train, y_train);
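
For what it's worth, the remaining error most likely comes from the grid itself rather than from RandomizedSearchCV: min_sample_split and min_sample_leaf are misspellings of min_samples_split and min_samples_leaf, and the classifier__ prefix only belongs in front of parameter names when the estimator is a Pipeline with a step named classifier, not a bare RandomForestClassifier. A corrected sketch under that assumption:

# Assumed fix: correct spellings, no step prefix (clf is a bare RandomForestClassifier)
grid = {'n_estimators': [10, 100, 200, 500, 1000, 1200],
        'max_depth': [None, 5, 10, 20, 30],
        'max_features': ['auto', 'sqrt'],
        'min_samples_split': [2, 4, 6],
        'min_samples_leaf': [1, 2, 4]}

rs_clf = RandomizedSearchCV(estimator=clf,
                            param_distributions=grid,
                            n_iter=10,
                            cv=5,
                            verbose=2)
rs_clf.fit(X_train, y_train)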