Question

大家好，感谢您的关注。我在用训练好的模型对预测表进行预测时遇到了问题，尝试了很多代码都没能解决。基本上，我的数据包含多种类型的字段（下面是一份数据样本，sample of my data），我想用 StandardScaler 对数据进行标准化。这是原始代码：

# Read the training set and discard every row that contains a missing value
df = pd.read_csv('training_data/R_training_set.csv', low_memory=False)
df = df.dropna(axis=0, how='any')
df.shape  # bare expression: only displays something in an interactive session

# Categorical columns that get expanded into indicator columns
ohe_fields = [
    'one_way',
    'surface_type',
    'street_type',
    'hour',
    'weekday',
    'month',
]

# One-hot encode the categorical columns; every other column is kept as-is
df_ohe = pd.get_dummies(df, columns=ohe_fields)

# Encode the categorical columns on their own to collect the indicator names
ohe_feature_names = (
    pd.get_dummies(df[ohe_fields], columns=ohe_fields)
    .columns
    .tolist()
)
df_ohe.head()

这就是我的数据

 # Sinuosity is typically close to 1, even for moderately curvy roads. A high sinuosity means a longer road.
# Maps a column name to the function applied to that column before scaling.
feature_transforms = {
    'sinuosity': np.log,
}
for feature, transform in feature_transforms.items():
    df_ohe[feature] = transform(df_ohe[feature])

# Continuously valued features
float_feature_names = [
    'accident_counts',
    'speed_limit',
    'aadt',
    'surface_width',
    'sinuosity',
    'euclidean_length',
    'segment_length',
    'road_orient_approx',
    'Rain',
    'dust',
    'temperature',
    'visibility',
    'wind_speed',
    'proximity_to_billboard',
    'proximity_to_major_road',
    'proximity_to_signal',
    'proximity_to_nearest_intersection',
    'proximity_to_nearest_exit',
    'population_density',
    'Hopspot',
]
# Select the columns with a plain label list. DataFrame.xs() expects a single
# label (or a tuple for a MultiIndex); list keys are not supported there.
float_features = df_ohe[float_feature_names].values

# Use scikit-learn's StandardScaler: fit on the training values and scale
# them in one step. The fitted statistics stay available on `scaler`.
scaler = StandardScaler()
float_scaled = scaler.fit_transform(float_features)

# Write the scaled values back over the original columns
df_ohe[float_feature_names] = float_scaled

# Persist the fitted scaler
with open('scalers_4.pkl', 'wb') as fp:
    pickle.dump(scaler, fp)


# Training labels
y = df['target'].values

# Columns listed as binary features; they are used without scaling
binary_feature_names = [
    'dew_point_temperature',
    'EXTREME_AIR_TEMPERATURE',
    'at_exit',
    'at_intersection',
]

# Keep only the model inputs, in a fixed order. Plain column selection is
# used because DataFrame.xs() does not accept a list of labels.
df_ohe = df_ohe[float_feature_names + binary_feature_names + ohe_feature_names]

X = df_ohe.values
feature_names = df_ohe.columns.tolist()

# Everything needed to repeat the preprocessing at prediction time.
# The scaler statistics are stored as plain per-column arrays taken from the
# fitted StandardScaler: `mean_` and `scale_`. Storing `float_scaled` (the
# whole scaled training matrix) as the standard deviation would make
# `(x - mean) / std` at prediction time divide by the wrong values.
# NOTE(review): the prediction code reads 'wrangler_2.pkl'; the code that
# writes this dict to that file is not shown here - make sure it is re-run
# so the file contains these arrays.
wrangler = {
    'float_scaler_mean': scaler.mean_,
    'float_scaler_std': scaler.scale_,
    'float_feature_names': float_feature_names,
    'ohe_fields': ohe_fields,
    'feature_names': feature_names,
    'feature_transforms': feature_transforms,
}

P.S. 我使用 xgboost 算法训练模型。

# Restore the preprocessing metadata from disk
with open('wrangler_2.pkl', 'rb') as fp:
    wrangler = pickle.load(fp)

# Unpack the entries into module-level names
(
    float_scaler_mean,
    float_scaler_std,
    float_feature_names,
    ohe_fields,
    feature_names,
) = (
    wrangler[key]
    for key in (
        'float_scaler_mean',
        'float_scaler_std',
        'float_feature_names',
        'ohe_fields',
        'feature_names',
    )
)

# Create an empty booster and load the saved model into it
booster = xgboost.Booster()
booster.load_model('new_0003.model')


def make_test_set(df, wrangler):
    """Build the model input matrix for ``df`` from saved preprocessing data.

    The steps mirror the training-time preprocessing: one-hot encode the
    categorical fields, apply the per-column feature transforms, standardise
    the float features with the stored mean and standard deviation, and
    finally select the columns in the stored feature order.

    Parameters
    ----------
    df : pandas.DataFrame
        Rows to convert. Must contain the float feature columns and the
        fields listed in ``wrangler['ohe_fields']``.
    wrangler : dict
        Preprocessing metadata with the keys ``'float_scaler_mean'``,
        ``'float_scaler_std'``, ``'float_feature_names'``, ``'ohe_fields'``
        and ``'feature_names'``. The optional key ``'feature_transforms'``
        maps a column name to a function applied to that column before
        scaling; when it is absent no transform is applied.

    Returns
    -------
    X : numpy.ndarray
        The values of the selected feature columns, one row per row of ``df``.
    feature_names : list
        Column names of ``X``, in order.

    Raises
    ------
    KeyError
        If a required column is missing and is not a one-hot indicator
        column of one of the ``ohe_fields``.
    """
    float_scaler_mean = wrangler['float_scaler_mean']
    float_scaler_std = wrangler['float_scaler_std']
    float_feature_names = wrangler['float_feature_names']
    ohe_fields = wrangler['ohe_fields']
    feature_names = wrangler['feature_names']
    # Wrangler dicts without this entry keep working: no transform is applied.
    feature_transforms = wrangler.get('feature_transforms') or {}
    print(len(feature_names))

    df_ohe = pd.get_dummies(df, columns=ohe_fields)

    # Repeat the training-time transforms before scaling; the stored
    # statistics describe the transformed columns, not the raw ones.
    for feature, transform in feature_transforms.items():
        df_ohe[feature] = transform(df_ohe[feature])

    # Plain column selection: DataFrame.xs() does not accept a list of labels.
    float_features = df_ohe[float_feature_names].values
    float_features = (float_features - float_scaler_mean) / float_scaler_std
    df_ohe[float_feature_names] = float_features

    # A category seen in training may not occur in this data, so its
    # indicator column is absent here. Such columns are all-zero by
    # definition. Any other missing column is a real error.
    missing = [name for name in feature_names if name not in df_ohe.columns]
    ohe_prefixes = tuple(str(field) + '_' for field in ohe_fields)
    unexpected = [
        name for name in missing
        if not str(name).startswith(ohe_prefixes)
    ]
    if unexpected:
        raise KeyError(
            'columns missing from the input data: {}'.format(unexpected)
        )
    if missing:
        zeros = pd.DataFrame(0, index=df_ohe.index, columns=missing)
        df_ohe = pd.concat([df_ohe, zeros], axis=1)

    # Keep only the model inputs, in the training column order
    df_ohe = df_ohe[feature_names]

    print(df_ohe.columns)
    print(df_ohe.columns.tolist())
    X = df_ohe.values
    feature_names = df_ohe.columns.tolist()
    return X, feature_names

# Build the feature matrix for the rows to score, then inspect the result
X, names = make_test_set(test_df, wrangler)
print(X.shape)
print(X[0])  # first row of the feature matrix

我收到错误

TypeError: unsupported operand type(s) for -: 'int' and 'StandardScaler'

我哪里做错了？请指点我正确的做法，谢谢。

Answer 1

在创建 wrangler 变量时，您把 StandardScaler 对象本身当作缩放参数的值传了进去。根据我对您定义 wrangler 时意图的猜测，我能想到的写法是：

# The scaler statistics must be plain per-column arrays so that
# `(x - mean) / std` works at prediction time. They are taken from the
# fitted StandardScaler (`mean_` and `scale_`); `float_scaled` is the whole
# scaled training matrix, not a standard deviation.
wrangler = {
    'float_scaler_mean': scaler.mean_,
    'float_scaler_std': scaler.scale_,
    'float_feature_names': float_feature_names,
    'ohe_fields': ohe_fields,
    'feature_names': feature_names,
    'feature_transforms': feature_transforms,
}

您到底想做什么？在 float_features = (float_features - float_scaler_mean) / float_scaler_std 这一行中，不能把 StandardScaler 对象当作 float_scaler_mean 使用。

TypeError: unsupported operand type(s) for -: 'int' and 'StandardScaler'（不支持在 'int' 和 'StandardScaler' 之间使用 - 运算符）

1 个答案: