Pandas DataFrame:将某一列中以逗号分隔的值拆分成新的数据框

时间:2019-04-05 16:25:39

标签: python pandas

我有一个要导入到数据框中的csv。我正在尝试将包含一堆用逗号分隔的值的单个列拆分为行。

const express = require('express');
const path = require('path');
const bodyParser = require('body-parser');
const session = require('express-session');
const cors = require('cors');
const mongoose = require('mongoose');
const errorHandler = require('errorhandler');

// Tell Mongoose to use the native Promise implementation. The property is the
// capitalised `mongoose.Promise`; assigning to lowercase `mongoose.promise`
// only creates an unrelated property and configures nothing.
mongoose.Promise = global.Promise;

// True only when NODE_ENV is exactly 'production'.
const isProduction = process.env.NODE_ENV === 'production';

// Create the Express application.
const app = express();

// Global middleware, applied in registration order: CORS, request logging,
// URL-encoded and JSON body parsing, static files from ./public, sessions.
app.use(cors());
app.use(require('morgan')('dev'));
app.use(bodyParser.urlencoded({ extended: false }));
app.use(bodyParser.json());
app.use(express.static(path.join(__dirname, 'public')));
// NOTE(review): the session secret is hard-coded here; consider loading it
// from configuration before deploying.
app.use(session({ secret: 'bookmarks-darius', cookie: { maxAge: 60000 }, resave: false, saveUninitialized: false }));

// Outside production, install the `errorhandler` middleware as well.
if (!isProduction) {
  app.use(errorHandler());
}

// Connect to the local `bookmarks` database and enable Mongoose debug output.
mongoose.connect('mongodb://localhost/bookmarks', { useNewUrlParser: true });
mongoose.set('debug', true);

// Load the models and the passport configuration for their side effects
// before mounting the routes.
require('./models/Users');
require('./models/Bookmarks');
require('./config/passport');
app.use(require('./routes'));
// Express does not treat an unmatched route as an error, so this catch-all
// middleware, registered after the routes, turns "nothing matched" into a 404.
// A three-argument middleware never receives an error value, so there is
// nothing to test for: reading an undeclared `err` here would throw a
// ReferenceError on every request and produce a 500 instead of a 404.
app.use(function (req, res, next) {
  const err = new Error('Not Found');
  err.status = 404;
  next(err);
});

// Error handlers & middlewares
// Outside production, answer with JSON that includes the full error object.
if (!isProduction) {
  // The four-parameter signature is what marks this as error-handling
  // middleware, so `next` stays in the list even though it is unused.
  app.use((err, req, res, next) => {
    const statusCode = err.status || 500;

    res.status(statusCode).json({
      errors: { message: err.message, error: err },
    });
  });
}

// Final error handler: reports the status and message only, with an empty
// `error` object instead of the error details.
app.use((err, req, res, next) => {
  const statusCode = err.status || 500;

  res.status(statusCode).json({
    errors: { message: err.message, error: {} },
  });
});

// Start the HTTP server on port 8000 and log its address once it is listening.
app.listen(8000, () => {
  console.log('Server running on http://localhost:8000/');
});

这是我写出的那段代码。是的,我知道这些列标题很荒谬,但这些 CSV 文件不是我制作的。它带有以下标头:

# Load the supplier CSV and print its column names.
df_supplier = pd.read_csv(wf['local_filename'])
print(list(df_supplier))
# `col` is the column holding the comma-separated commodities; `melt_col` is
# the supplier ID column used as the melt identifier and final index.
col = 'Commodities (Use Ctrl to select multiple)'
melt_col = 'Supplier (DTRM ID)'
# NOTE(review): `.loc[:, col]` selects only the commodities column, so
# `melt_col` is no longer present when `.melt(id_vars=melt_col)` runs; this
# looks like the source of the reported KeyError. `.loc[:[melt_col, col]]`
# also appears to be missing a comma after the row slice. Confirm both.
df_supplier_commodities = df_supplier.loc[:, col]\                            
                                     .apply(pd.Series)\
                                     .reset_index()\
                                     .melt(id_vars=melt_col)\
                                     .dropna()\
                                     .loc[:[melt_col, col]]\
                                     .set_index(melt_col)

需要用到的标头是 Supplier (DTRM ID),以及 Commodities (Use Ctrl to select multiple)。一个供应商 ID 下可以包含多个商品,因此我希望每一行只有一个商品,并带有对应的供应商 ID。

以下代码错误:

['Supplier (DTRM ID)', 'Status', 'Sent for Approval Date', 'Approval Date', 'Legal Company Name', 'Supplier ID', 'Company Description (Owner To Complete)', 'Parent Supplier ID', 'Parent Supplier Name', 'List of Affiliates', 'Category Manager', 'Country', 'DUNS code', 'Trade register name', 'Commodities (Use Ctrl to select multiple)', 'Default Commodity', 'City', 'State', 'Payment Terms', 'Deactivated', 'Tag', 'Created by', 'Creation Date']

但是 print(list(df_supplier)) 显示这个键(列名)确实存在。我哪里做错了?

我想确保我已经清楚了,所以我将举例说明数据框中的数据布局:

Traceback (most recent call last):
  File "/home/ec2-user/determine_etl/lib/python3.6/site-packages/pandas/core/indexes/base.py", line 2656, in get_loc
    return self._engine.get_loc(key)
  File "pandas/_libs/index.pyx", line 108, in pandas._libs.index.IndexEngine.get_loc
  File "pandas/_libs/index.pyx", line 132, in pandas._libs.index.IndexEngine.get_loc
  File "pandas/_libs/hashtable_class_helper.pxi", line 1601, in pandas._libs.hashtable.PyObjectHashTable.get_item
  File "pandas/_libs/hashtable_class_helper.pxi", line 1608, in pandas._libs.hashtable.PyObjectHashTable.get_item
KeyError: 'Supplier (DTRM ID)'

这是我要获取的输出:

+--------------------+---------------------------------------------+
| Supplier (DTRM ID) |  Commodities (Use Ctrl to select multiple)  |
+--------------------+---------------------------------------------+
|              12333 | Strawberry, Raspberry, Flamingo, Snozzberry |
+--------------------+---------------------------------------------+

我以为我的代码可以做到这一点,但它告诉我 'Supplier (DTRM ID)' 不是有效的键(请参阅回溯)。

2 个答案:

答案 0 :(得分:0)

听起来你有类似的东西:

# Sample frame: a single column 'A' whose cells each hold several
# comma-separated values.
df = pd.DataFrame(
    ['11, 5.1, 2.8', '6, 4, 0', '0, 2, 0'],
    columns=['A'],
)

       A
0   11, 5.1, 2.8
1   6, 4, 0
2   0, 2, 0  

A列带有“,”分隔的值。

您可以执行以下操作将每个值放入其自己的列中:

# Split on each comma together with any whitespace that follows it, so the new
# columns hold '5.1' rather than ' 5.1'. expand=True returns one column per part.
df['A'].str.split(r',\s*', expand = True)

您将获得以下信息:

    0   1   2
0   11  5.1 2.8
1   6   4   0
2   0   2   0

得到的列名为 0、1、2。然后,您可以使用 .rename() 更改列名,并使用 .T 进行转置,把它们变成行。由于没有示例数据框,很难准确理解您想要实现的效果。

编辑:

这对我有用:

# Split the commodities on each comma plus any whitespace after it (so values
# are not left with a leading space) and place one commodity per column next to
# the supplier ID. melt() then turns every (supplier, commodity) pair into its
# own row; suppliers with fewer commodities yield NaN rows, which are dropped.
pd.concat([df['Supplier (DTRM ID)'], df['Commodities (Use Ctrl to select multiple)'].str.split(r',\s*', expand = True)], axis = 1)\
        .melt(id_vars=['Supplier (DTRM ID)'])\
        .sort_values(by = 'Supplier (DTRM ID)')\
        .rename(columns = {'value': 'Commodities (Use Ctrl to select multiple)'})\
        .drop(columns = ['variable'])\
        .dropna()

(行尾的 \ 是续行符,只是为了便于阅读)

答案 1 :(得分:0)

import pandas as pd

# Sample data: one row per supplier, with all of that supplier's commodities
# packed into a single comma-separated string.
df = pd.DataFrame(
    {
        'Supplier': [12333, 12334],
        'Commodities': [
            'Strawberry, Raspberry, Flamingo, Snozzberry',
            'Steak, Lobster, Salmon, Tuna',
        ],
    }
)

# display(df)
   Supplier                                  Commodities
0     12333  Strawberry, Raspberry, Flamingo, Snozzberry
1     12334                 Steak, Lobster, Salmon, Tuna

# Break each comma-separated string into a list, give every list element its
# own row (the supplier value is repeated), then renumber the rows from 0.
df = (
    df.assign(Commodities=df['Commodities'].str.split(', '))
      .explode('Commodities')
      .reset_index(drop=True)
)

# display(df)
   Supplier Commodities
0     12333  Strawberry
1     12333   Raspberry
2     12333    Flamingo
3     12333  Snozzberry
4     12334       Steak
5     12334     Lobster
6     12334      Salmon
7     12334        Tuna