Looping a pandas DataFrame over multiple groupby calls and writing each result to Excel

Asked: 2016-01-15 05:41:43

Tags: python pandas dataframe xlsxwriter

I currently have a script that builds an Excel output file from a pandas df. I run the script 5 times, changing only the columns I group by, and then manually append all 5 sheets to a master file. I'd like to know how to loop the script over 5 different groupby specifications automatically, producing 5 separate sheets in the xlsx output.

These are the lines I normally paste in under the '###  Column Renaming, NaN Replacing & DataFrame Column Additions' comment:

grouped = df.groupby(['customer_account', 'CounterPartyID'])

grouped = df.groupby(['customer_account', 'CounterPartyID', 'symbol'])  

grouped = df.groupby(['customer_account', 'CounterPartyID', 'Providers', 'symbol'])

grouped = df.groupby(['Providers', 'customer_account'])

grouped = df.groupby(['Providers', 'symbol'])

Here is the full script:

import pandas as pd
import numpy as np
import csv
import time
import glob
import datetime
import re
import sys
import os
from dateutil import relativedelta
from xlsxwriter.utility import xl_rowcol_to_cell


'''This is where I find the file with the compiled data and add the needed columns to the df'''
###  File Finding Stuff
file_names = sorted(glob.glob(r'T:\Tom\Scripts\\' + '*_fillssideclient.csv'), reverse=True)
file = file_names[0]
date = os.path.basename(file)[0:8]
#file = "20151215_fillssideclient.csv"  ### For manual file pulls
df = pd.read_csv(file)

###  Column Renaming, NaN Replacing & DataFrame Column Additions
df.rename(columns={'provider':'Providers'}, inplace=True)
df = df.replace(np.nan,'All Tags', regex=True)
df['five_avg'] =    df.iloc[:, 30:40].sum(axis=1).astype('int64') / 10          #Added column at end of df for 5s avg
df['ten_avg'] =     df.iloc[:, 30:50].sum(axis=1).astype('int64') / 20          #Added column at end of df for 10s avg
df['twenty_avg'] =  df.iloc[:, 30:70].sum(axis=1).astype('int64') / 40          #Added column at end of df for 20s avg



#This is the primary function that I need to have my 5 'groupby' variables loop through and create 5 sheets
###  Primary DataFrame Calculations
filled_total =      df['filled'].sum()
order_total =       grouped['filled'].count()
total_tickets =     grouped['filled'].sum()
share =             total_tickets / filled_total
fill_rate =         total_tickets / order_total
total_size =        grouped['fill_size'].sum()
avg_size =          total_size / total_tickets

###  One Second Calculations
one_toxicity =      grouped.apply(lambda x: x['filled'][x['1000'] < -25].sum()) / total_tickets
one_average =       grouped.apply(lambda x: x[x['filled'] == 1]['1000'].mean())
one_low =           grouped.apply(lambda x: x[x['filled'] == 1]['1000'].quantile(.25))
one_med =           grouped.apply(lambda x: x[x['filled'] == 1]['1000'].quantile(.50))
one_high =          grouped.apply(lambda x: x[x['filled'] == 1]['1000'].quantile(.75))

###  Five Second Calculations
#five_toxicity =    grouped.apply(lambda x: x['filled'][x['5000'] < -25].sum()) / total_tickets
five_average =      grouped.apply(lambda x: x[x['filled'] == 1]['five_avg'].mean())
five_low =          grouped.apply(lambda x: x[x['filled'] == 1]['five_avg'].quantile(.25))
five_med =          grouped.apply(lambda x: x[x['filled'] == 1]['five_avg'].quantile(.50))
five_high =         grouped.apply(lambda x: x[x['filled'] == 1]['five_avg'].quantile(.75))
#five_std =         grouped.apply(lambda x: x[x['filled'] == 1]['five_avg'].std())

###  Ten Second Calculations
#ten_toxicity =     grouped.apply(lambda x: x['filled'][x['10000'] < -25].sum()) / total_tickets
ten_average =       grouped.apply(lambda x: x[x['filled'] == 1]['ten_avg'].mean())
ten_low =           grouped.apply(lambda x: x[x['filled'] == 1]['ten_avg'].quantile(.25))
ten_med =           grouped.apply(lambda x: x[x['filled'] == 1]['ten_avg'].quantile(.50))
ten_high =          grouped.apply(lambda x: x[x['filled'] == 1]['ten_avg'].quantile(.75))
#ten_std =          grouped.apply(lambda x: x[x['filled'] == 1]['ten_avg'].std())

###  Twenty Second Calculations
#twenty_toxicity =  grouped.apply(lambda x: x['filled'][x['20000'] < -50].sum()) / total_tickets
twenty_avg =        grouped.apply(lambda x: x[x['filled'] == 1]['twenty_avg'].mean())
twenty_low =        grouped.apply(lambda x: x[x['filled'] == 1]['twenty_avg'].quantile(.25))
twenty_med =        grouped.apply(lambda x: x[x['filled'] == 1]['twenty_avg'].quantile(.50))
twenty_high =       grouped.apply(lambda x: x[x['filled'] == 1]['twenty_avg'].quantile(.75))
#twenty_std =       grouped.apply(lambda x: x[x['filled'] == 1]['twenty_avg'].std())



###  Column Formatting
#comma_fmt =        workbook.add_format({'num_format': '#,##0'})
#money_fmt =        workbook.add_format({'num_format': '$#,##0.000'})
#percent_fmt =      workbook.add_format({'num_format': '0.0%'})
#Still need to figure out how to customize column width, column format and conditional formatting
list_of_lists = [
    ['Trades',      total_tickets],
    ['Share %',     share],
    ['Fill Rate',   fill_rate],
    ['Total Size',  total_size],
    ['Avg Size',    avg_size],
    ['1s Toxic',    one_toxicity],
    ['1s Avg',      one_average],
    ['1s 25th',     one_low],
    ['1s 50th',     one_med],
    ['1s 75th',     one_high],
    ['5s Avg',      five_average],
    ['5s 25th',     five_low],
    ['5s 50th',     five_med],
    ['5s 75th',     five_high],
    ['10s Avg',     ten_average],
    ['10s 25th',    ten_low],
    ['10s 50th',    ten_med],
    ['10s 75th',    ten_high],
    ['20s Avg',     twenty_avg],
    ['20s 25th',    twenty_low],
    ['20s 50th',    twenty_med],
    ['20s 75th',    twenty_high]
]
result =            pd.concat([lst[1] for lst in list_of_lists], axis=1)
result.columns =    [lst[0] for lst in list_of_lists]
result =            result[result.Trades > 0]           # Removes results that are less than 1...use '!= 0' to remove only 0 trades




#   This is where I find the output location, declare my 'groupby' variables and execute the script
writer = pd.ExcelWriter(date + '_counterparty_monthly.xlsx', engine='xlsxwriter')

result.to_excel(writer, sheet_name='All Trades')
workbook = writer.book
worksheet = writer.sheets['All Trades']
worksheet.set_zoom(80)

#Worksheet and Print Options
worksheet.hide_gridlines(2)
worksheet.fit_to_pages(1, 1)


writer.save()

1 Answer:

Answer (score: 2)

IIUC, you can put the column lists into one list and iterate over it with a for loop, appending the loop index to the sheet name:

cols = [['customer_account', 'CounterPartyID'],
        ['customer_account', 'CounterPartyID', 'symbol'],
        ['customer_account', 'CounterPartyID', 'Providers', 'symbol'],
        ['Providers', 'customer_account'],
        ['Providers', 'symbol']]

for i, col in enumerate(cols):
    print(col)
    print(i)
    #grouped = df.groupby(col)

    sheetname = 'All Trades-' + str(i)
    print(sheetname)

#['customer_account', 'CounterPartyID']
#0
#All Trades-0
#['customer_account', 'CounterPartyID', 'symbol']
#1
#All Trades-1
#['customer_account', 'CounterPartyID', 'Providers', 'symbol']
#2
#All Trades-2
#['Providers', 'customer_account']
#3
#All Trades-3
#['Providers', 'symbol']
#4
#All Trades-4

Then use the sheetname variable where you write the sheet (the to_excel call and the worksheet lookup):

#add sheet name
result.to_excel(writer, sheet_name=sheetname)
workbook = writer.book

#add sheet name
worksheet = writer.sheets[sheetname]
worksheet.set_zoom(80)
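
As a side note (not part of the original answer), the sheet name can also be built from the grouping columns themselves so the tabs are self-describing; Excel caps sheet names at 31 characters, so truncate:

sheetname = '-'.join(col)[:31]   # e.g. 'customer_account-CounterPartyID'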

You only need to open and save the Excel file once:

#   This is where I find the output location, declare my 'groupby' variables and execute the script
writer = pd.ExcelWriter(date + '_counterparty_monthly.xlsx', engine='xlsxwriter')    


for i, col in enumerate(cols):
    #print(col)
    #print(i)
    grouped = df.groupby(col)
    # ... (per-group calculations, result assembly and the to_excel call, as above) ...

    #Worksheet and Print Options
    worksheet.hide_gridlines(2)
    worksheet.fit_to_pages(1, 1)


writer.save()

All together:

import pandas as pd
import numpy as np
import csv
import time
import glob
import datetime
import re
import sys
import os
from dateutil import relativedelta
from xlsxwriter.utility import xl_rowcol_to_cell


'''This is where I find the file with the compiled data and add the needed columns to the df'''
###  File Finding Stuff
file_names = sorted(glob.glob(r'T:\Tom\Scripts\\' + '*_fillssideclient.csv'), reverse=True)
file = file_names[0]
date = os.path.basename(file)[0:8]
#file = "20151215_fillssideclient.csv"  ### For manual file pulls
df = pd.read_csv(file)

###  Column Renaming, NaN Replacing & DataFrame Column Additions
df.rename(columns={'provider':'Providers'}, inplace=True)
df = df.replace(np.nan,'All Tags', regex=True)
df['five_avg'] =    df.iloc[:, 30:40].sum(axis=1).astype('int64') / 10          #Added column at end of df for 5s avg
df['ten_avg'] =     df.iloc[:, 30:50].sum(axis=1).astype('int64') / 20          #Added column at end of df for 10s avg
df['twenty_avg'] =  df.iloc[:, 30:70].sum(axis=1).astype('int64') / 40          #Added column at end of df for 20s avg


cols = [['customer_account', 'CounterPartyID'],
        ['customer_account', 'CounterPartyID', 'symbol'],
        ['customer_account', 'CounterPartyID', 'Providers', 'symbol'],
        ['Providers', 'customer_account'],
        ['Providers', 'symbol']]

#   This is where I find the output location, declare my 'groupby' variables and execute the script
writer = pd.ExcelWriter(date + '_counterparty_monthly.xlsx', engine='xlsxwriter')    


for i, col in enumerate(cols):
    #print(col)
    #print(i)
    grouped = df.groupby(col)

    sheetname = 'All Trades-' + str(i)
    #print(sheetname)


    #This is the primary function that I need to have my 5 'groupby' variables loop through and create 5 sheets
    ###  Primary DataFrame Calculations
    filled_total =      df['filled'].sum()
    order_total =       grouped['filled'].count()
    total_tickets =     grouped['filled'].sum()
    share =             total_tickets / filled_total
    fill_rate =         total_tickets / order_total
    total_size =        grouped['fill_size'].sum()
    avg_size =          total_size / total_tickets


    ###  One Second Calculations
    one_toxicity =      grouped.apply(lambda x: x['filled'][x['1000'] < -25].sum()) / total_tickets
    one_average =       grouped.apply(lambda x: x[x['filled'] == 1]['1000'].mean())
    one_low =           grouped.apply(lambda x: x[x['filled'] == 1]['1000'].quantile(.25))
    one_med =           grouped.apply(lambda x: x[x['filled'] == 1]['1000'].quantile(.50))
    one_high =          grouped.apply(lambda x: x[x['filled'] == 1]['1000'].quantile(.75))

    ###  Five Second Calculations
    #five_toxicity =    grouped.apply(lambda x: x['filled'][x['5000'] < -25].sum()) / total_tickets
    five_average =      grouped.apply(lambda x: x[x['filled'] == 1]['five_avg'].mean())
    five_low =          grouped.apply(lambda x: x[x['filled'] == 1]['five_avg'].quantile(.25))
    five_med =          grouped.apply(lambda x: x[x['filled'] == 1]['five_avg'].quantile(.50))
    five_high =         grouped.apply(lambda x: x[x['filled'] == 1]['five_avg'].quantile(.75))
    #five_std =         grouped.apply(lambda x: x[x['filled'] == 1]['five_avg'].std())

    ###  Ten Second Calculations
    #ten_toxicity =     grouped.apply(lambda x: x['filled'][x['10000'] < -25].sum()) / total_tickets
    ten_average =       grouped.apply(lambda x: x[x['filled'] == 1]['ten_avg'].mean())
    ten_low =           grouped.apply(lambda x: x[x['filled'] == 1]['ten_avg'].quantile(.25))
    ten_med =           grouped.apply(lambda x: x[x['filled'] == 1]['ten_avg'].quantile(.50))
    ten_high =          grouped.apply(lambda x: x[x['filled'] == 1]['ten_avg'].quantile(.75))
    #ten_std =          grouped.apply(lambda x: x[x['filled'] == 1]['ten_avg'].std())

    ###  Twenty Second Calculations
    #twenty_toxicity =  grouped.apply(lambda x: x['filled'][x['20000'] < -50].sum()) / total_tickets
    twenty_avg =        grouped.apply(lambda x: x[x['filled'] == 1]['twenty_avg'].mean())
    twenty_low =        grouped.apply(lambda x: x[x['filled'] == 1]['twenty_avg'].quantile(.25))
    twenty_med =        grouped.apply(lambda x: x[x['filled'] == 1]['twenty_avg'].quantile(.50))
    twenty_high =       grouped.apply(lambda x: x[x['filled'] == 1]['twenty_avg'].quantile(.75))
    #twenty_std =       grouped.apply(lambda x: x[x['filled'] == 1]['twenty_avg'].std())



    ###  Column Formatting
    #comma_fmt =        workbook.add_format({'num_format': '#,##0'})
    #money_fmt =        workbook.add_format({'num_format': '$#,##0.000'})
    #percent_fmt =      workbook.add_format({'num_format': '0.0%'})
    #Still need to figure out how to customize column width, column format and conditional formatting
    list_of_lists = [
        ['Trades',      total_tickets],
        ['Share %',     share],
        ['Fill Rate',   fill_rate],
        ['Total Size',  total_size],
        ['Avg Size',    avg_size],
        ['1s Toxic',    one_toxicity],
        ['1s Avg',      one_average],
        ['1s 25th',     one_low],
        ['1s 50th',     one_med],
        ['1s 75th',     one_high],
        ['5s Avg',      five_average],
        ['5s 25th',     five_low],
        ['5s 50th',     five_med],
        ['5s 75th',     five_high],
        ['10s Avg',     ten_average],
        ['10s 25th',    ten_low],
        ['10s 50th',    ten_med],
        ['10s 75th',    ten_high],
        ['20s Avg',     twenty_avg],
        ['20s 25th',    twenty_low],
        ['20s 50th',    twenty_med],
        ['20s 75th',    twenty_high]
    ]
    result =            pd.concat([lst[1] for lst in list_of_lists], axis=1)
    result.columns =    [lst[0] for lst in list_of_lists]
    result =            result[result.Trades > 0]           # Removes results that are less than 1...use '!= 0' to remove only 0 trades




    result.to_excel(writer, sheet_name=sheetname)
    workbook = writer.book

    #add sheet name
    worksheet = writer.sheets[sheetname]
    worksheet.set_zoom(80)

    #Worksheet and Print Options
    worksheet.hide_gridlines(2)
    worksheet.fit_to_pages(1, 1)


writer.save()
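
For the column formatting that is still commented out under ###  Column Formatting, here is a minimal sketch (not from the original answer) of how those xlsxwriter formats could be applied inside the loop once worksheet exists; the column letters and widths are assumptions, so map them to wherever the metrics actually land in your sheet:

    # inside the for loop, after worksheet = writer.sheets[sheetname]
    comma_fmt   = workbook.add_format({'num_format': '#,##0'})
    money_fmt   = workbook.add_format({'num_format': '$#,##0.000'})
    percent_fmt = workbook.add_format({'num_format': '0.0%'})

    # assumed layout: Trades in column B, Share % / Fill Rate in C:D, sizes in E:F
    worksheet.set_column('B:B', 12, comma_fmt)
    worksheet.set_column('C:D', 12, percent_fmt)
    worksheet.set_column('E:F', 12, comma_fmt)
    # conditional formatting works the same way, e.g.:
    #worksheet.conditional_format('G2:G100', {'type': '3_color_scale'})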