###############汇总与分组##########################
import seaborn as sns
import numpy as np
import pandas as pd
# Load the example "planets" dataset through seaborn.
planets = sns.load_dataset("planets")
# Bare expression: the value is discarded when this runs as a script.
planets.shape

# Seeded generator so the random example data below is reproducible.
rng = np.random.RandomState(42)
ser = pd.Series(rng.rand(5))
# Bare expressions: the results are computed and then discarded.
ser.sum()
ser.mean()

# 'A' is drawn from the generator before 'B'; keep this order.
df = pd.DataFrame({"A": rng.rand(5), "B": rng.rand(5)})
# Mean of each column, then mean of each row.
print(df.mean(axis="index"))
print(df.mean(axis=1))
# Summary statistics over the rows that have no missing values.
print(planets.dropna().describe())
'''
Aggregation Description
count() Total number of items
first(), last() First and last item
mean(), median() Mean and median
min(), max() Minimum and maximum
std(), var() Standard deviation and variance
mad() Mean absolute deviation (deprecated in pandas 1.5, removed in 2.0)
prod() Product of all items
sum() Sum of all items
These are all methods of DataFrame and Series objects.
'''
# GroupBy: split, apply, combine.
df = pd.DataFrame(
    {"key": list("ABCABC"), "data": range(6)},
    columns=["key", "data"],
)
print(df)
print(df.groupby("key").sum())

# Select one column of the grouped planets data, then aggregate it.
group_op = planets.groupby("method")["orbital_period"]
print(group_op.median())

# A GroupBy object can be iterated directly, yielding (key, sub-frame) pairs.
for method, group in planets.groupby("method"):
    print(f"{method:30s} shape={group.shape}")

gdu = (
    planets.groupby("method")["year"]
    .describe()
    .unstack()
)
print(gdu)
print(df)
'''
aggregate()方法允许更大的灵活性。
它可以采用字符串,函数或其列表,然后一次计算所有聚合
'''
# Compute several aggregations per group in one call.
# The aggregations are requested by name: recent pandas versions emit a
# FutureWarning when callables such as np.median or the builtin max are
# passed here, and the resulting column labels ('min', 'median', 'max')
# are the same either way.
dfk = df.groupby('key').aggregate(['min', 'median', 'max'])
print(dfk)
# Re-seed so the random 'data2' column is reproducible.
rng = np.random.RandomState(0)
df = pd.DataFrame(
    {
        "key": list("ABCABC"),
        "data1": range(6),
        "data2": rng.randint(0, 10, 6),
    },
    columns=["key", "data1", "data2"],
)

# A dict maps each column name to the aggregation applied to that column.
dfagg = df.groupby("key").agg({"data1": "min", "data2": "max"})
print(dfagg)
# Filtering
def filter_func(x):
    """Return whether the standard deviation of ``x['data2']`` exceeds 4.

    ``x`` is indexed with the ``'data2'`` key; the script passes this
    function to ``GroupBy.filter``.
    """
    spread = x['data2'].std()
    return spread > 4
print(df)
print('-----------------------------')
# Per-group standard deviation of each column.
print(df.groupby('key').std())
print('-----------------------------')
# Keep only the rows of groups for which filter_func returns True.
print(df.groupby('key').filter(filter_func))
print('-----------------------------')
# Centre the data: subtract each group's mean from its values.
dfr = df.groupby('key').transform(lambda values: values - values.mean())
print(dfr)
# apply() 方法的应用
'''
该apply()方法使您可以将任意函数应用于分组结果。
该函数应采用DataFrame,并返回Pandas对象(
例如DataFrame,Series)或标量;合并操作将根据返回的输出类型进行调整。
'''
def norm_by_data2(x):
    """Return a copy of ``x`` with ``data1`` divided by the sum of ``data2``.

    Parameters:
        x: DataFrame of group values with ``'data1'`` and ``'data2'`` columns.

    Returns:
        A new DataFrame with the same index and columns as ``x`` whose
        ``data1`` column has been divided by ``x['data2'].sum()``.
    """
    # Work on a copy: the frame handed over by groupby().apply() should not
    # be modified in place by the applied function.
    normalized = x.copy()
    normalized['data1'] = normalized['data1'] / normalized['data2'].sum()
    return normalized
# Apply norm_by_data2 to every group and combine the results.
df_norm = df.groupby("key").apply(norm_by_data2)
print(df_norm)

# Specifying the split key: a list with one group label per row.
L = [0, 1, 0, 1, 2, 0]
print(df.groupby(L).sum())

# A dict (or Series) maps index values to group names.
df2 = df.set_index("key")
print(df2)
mapping = {
    "A": "vowel",
    "B": "consonant",
    "C": "consonant",
}
print('------------------------------')
print(df2.groupby(mapping).sum())

# Like a mapping, any Python function that takes an index value and
# returns the group can be passed.
print(df2.groupby(str.lower).mean())
# Several valid keys can be combined in a list.
print(df2.groupby([str.lower, mapping]).mean())
# Grouping example: sum of the 'number' column per method and decade.
# Floor each year to the start of its decade and label it like "1990s".
decade = ((planets["year"] // 10) * 10).astype(str) + "s"
decade = decade.rename("decade")
print(
    planets.groupby(["method", decade])["number"]
    .sum()
    .unstack()
    .fillna(0)
)
# 原创-Pandas心法之数据汇总与分组-6
# 未经允许不得转载:同乐学堂 » 原创-Pandas心法之数据汇总与分组-6