import numpy as np
import pandas as pd
from pandas import Series, DataFrame
# 读入城市天气csv文件
df = pd.read_csv('/Users/bennyrhys/Desktop/数据分析可视化-数据集/homework/city_weather.csv')
df
|
date |
city |
temperature |
wind |
---|
0 |
03/01/2016 |
BJ |
8 |
5 |
1 |
17/01/2016 |
BJ |
12 |
2 |
2 |
31/01/2016 |
BJ |
19 |
2 |
3 |
14/02/2016 |
BJ |
-3 |
3 |
4 |
28/02/2016 |
BJ |
19 |
2 |
5 |
13/03/2016 |
BJ |
5 |
3 |
6 |
27/03/2016 |
SH |
-4 |
4 |
7 |
10/04/2016 |
SH |
19 |
3 |
8 |
24/04/2016 |
SH |
20 |
3 |
9 |
08/05/2016 |
SH |
17 |
3 |
10 |
22/05/2016 |
SH |
4 |
2 |
11 |
05/06/2016 |
SH |
-10 |
4 |
12 |
19/06/2016 |
SH |
0 |
5 |
13 |
03/07/2016 |
SH |
-9 |
5 |
14 |
17/07/2016 |
GZ |
10 |
2 |
15 |
31/07/2016 |
GZ |
-1 |
5 |
16 |
14/08/2016 |
GZ |
1 |
5 |
17 |
28/08/2016 |
GZ |
25 |
4 |
18 |
11/09/2016 |
SZ |
20 |
1 |
19 |
25/09/2016 |
SZ |
-10 |
4 |
# 根据城市进行分组
g = df.groupby('city')
g
<pandas.core.groupby.generic.DataFrameGroupBy object at 0x1201a06d0>
# 可以直接进行聚合操作
g.mean()
|
temperature |
wind |
---|
city |
|
|
---|
BJ |
10.000 |
2.833333 |
GZ |
8.750 |
4.000000 |
SH |
4.625 |
3.625000 |
SZ |
5.000 |
2.500000 |
g.describe()
|
temperature |
wind |
---|
|
count |
mean |
std |
min |
25% |
50% |
75% |
max |
count |
mean |
std |
min |
25% |
50% |
75% |
max |
---|
city |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
---|
BJ |
6.0 |
10.000 |
8.532292 |
-3.0 |
5.75 |
10.0 |
17.25 |
19.0 |
6.0 |
2.833333 |
1.169045 |
2.0 |
2.00 |
2.5 |
3.00 |
5.0 |
GZ |
4.0 |
8.750 |
11.842719 |
-1.0 |
0.50 |
5.5 |
13.75 |
25.0 |
4.0 |
4.000000 |
1.414214 |
2.0 |
3.50 |
4.5 |
5.00 |
5.0 |
SH |
8.0 |
4.625 |
12.489281 |
-10.0 |
-5.25 |
2.0 |
17.50 |
20.0 |
8.0 |
3.625000 |
1.060660 |
2.0 |
3.00 |
3.5 |
4.25 |
5.0 |
SZ |
2.0 |
5.000 |
21.213203 |
-10.0 |
-2.50 |
5.0 |
12.50 |
20.0 |
2.0 |
2.500000 |
2.121320 |
1.0 |
1.75 |
2.5 |
3.25 |
4.0 |
# 传入聚合函数 使用
g.agg('min')
|
date |
temperature |
wind |
---|
city |
|
|
|
---|
BJ |
03/01/2016 |
-3 |
2 |
GZ |
14/08/2016 |
-1 |
2 |
SH |
03/07/2016 |
-10 |
2 |
SZ |
11/09/2016 |
-10 |
1 |
# 测试自定义聚合函数
def foo(attr):
print(type(attr)),print(attr)
return np.nan
g.agg(foo)
<class 'pandas.core.series.Series'>
0 03/01/2016
1 17/01/2016
2 31/01/2016
3 14/02/2016
4 28/02/2016
5 13/03/2016
Name: date, dtype: object
<class 'pandas.core.series.Series'>
14 17/07/2016
15 31/07/2016
16 14/08/2016
17 28/08/2016
Name: date, dtype: object
<class 'pandas.core.series.Series'>
6 27/03/2016
7 10/04/2016
8 24/04/2016
9 08/05/2016
10 22/05/2016
11 05/06/2016
12 19/06/2016
13 03/07/2016
Name: date, dtype: object
<class 'pandas.core.series.Series'>
18 11/09/2016
19 25/09/2016
Name: date, dtype: object
<class 'pandas.core.series.Series'>
0 8
1 12
2 19
3 -3
4 19
5 5
Name: temperature, dtype: int64
<class 'pandas.core.series.Series'>
14 10
15 -1
16 1
17 25
Name: temperature, dtype: int64
<class 'pandas.core.series.Series'>
6 -4
7 19
8 20
9 17
10 4
11 -10
12 0
13 -9
Name: temperature, dtype: int64
<class 'pandas.core.series.Series'>
18 20
19 -10
Name: temperature, dtype: int64
<class 'pandas.core.series.Series'>
0 5
1 2
2 2
3 3
4 2
5 3
Name: wind, dtype: int64
<class 'pandas.core.series.Series'>
14 2
15 5
16 5
17 4
Name: wind, dtype: int64
<class 'pandas.core.series.Series'>
6 4
7 3
8 3
9 3
10 2
11 4
12 5
13 5
Name: wind, dtype: int64
<class 'pandas.core.series.Series'>
18 1
19 4
Name: wind, dtype: int64
|
date |
temperature |
wind |
---|
city |
|
|
|
---|
BJ |
NaN |
NaN |
NaN |
GZ |
NaN |
NaN |
NaN |
SH |
NaN |
NaN |
NaN |
SZ |
NaN |
NaN |
NaN |
# 自定义函数 聚合 (最大值-最小值)
def foo(attr):
return attr.max() - attr.min()
g.agg(foo)
|
temperature |
wind |
---|
city |
|
|
---|
BJ |
22 |
3 |
GZ |
26 |
3 |
SH |
30 |
3 |
SZ |
30 |
3 |
# 对某两个列做groupBy
g_new = df.groupby(['city', 'wind'])
g_new
<pandas.core.groupby.generic.DataFrameGroupBy object at 0x1213acdd0>
# 细分了风力的group
g_new.groups
{('BJ', 2): Int64Index([1, 2, 4], dtype='int64'),
('BJ', 3): Int64Index([3, 5], dtype='int64'),
('BJ', 5): Int64Index([0], dtype='int64'),
('GZ', 2): Int64Index([14], dtype='int64'),
('GZ', 4): Int64Index([17], dtype='int64'),
('GZ', 5): Int64Index([15, 16], dtype='int64'),
('SH', 2): Int64Index([10], dtype='int64'),
('SH', 3): Int64Index([7, 8, 9], dtype='int64'),
('SH', 4): Int64Index([6, 11], dtype='int64'),
('SH', 5): Int64Index([12, 13], dtype='int64'),
('SZ', 1): Int64Index([18], dtype='int64'),
('SZ', 4): Int64Index([19], dtype='int64')}
# 多列groupby get也要注意
g_new.get_group(('BJ',2))
|
date |
city |
temperature |
wind |
---|
1 |
17/01/2016 |
BJ |
12 |
2 |
2 |
31/01/2016 |
BJ |
19 |
2 |
4 |
28/02/2016 |
BJ |
19 |
2 |
# 多列GroupBy的for
for (name_1, name_2), group in g_new:
print(name_1, name_2)
print(group)
BJ 2
date city temperature wind
1 17/01/2016 BJ 12 2
2 31/01/2016 BJ 19 2
4 28/02/2016 BJ 19 2
BJ 3
date city temperature wind
3 14/02/2016 BJ -3 3
5 13/03/2016 BJ 5 3
BJ 5
date city temperature wind
0 03/01/2016 BJ 8 5
GZ 2
date city temperature wind
14 17/07/2016 GZ 10 2
GZ 4
date city temperature wind
17 28/08/2016 GZ 25 4
GZ 5
date city temperature wind
15 31/07/2016 GZ -1 5
16 14/08/2016 GZ 1 5
SH 2
date city temperature wind
10 22/05/2016 SH 4 2
SH 3
date city temperature wind
7 10/04/2016 SH 19 3
8 24/04/2016 SH 20 3
9 08/05/2016 SH 17 3
SH 4
date city temperature wind
6 27/03/2016 SH -4 4
11 05/06/2016 SH -10 4
SH 5
date city temperature wind
12 19/06/2016 SH 0 5
13 03/07/2016 SH -9 5
SZ 1
date city temperature wind
18 11/09/2016 SZ 20 1
SZ 4
date city temperature wind
19 25/09/2016 SZ -10 4
g.groups
{'BJ': Int64Index([0, 1, 2, 3, 4, 5], dtype='int64'),
'GZ': Int64Index([14, 15, 16, 17], dtype='int64'),
'SH': Int64Index([6, 7, 8, 9, 10, 11, 12, 13], dtype='int64'),
'SZ': Int64Index([18, 19], dtype='int64')}
# 单列group 获取获取某列
g.get_group('BJ')
|
date |
temperature |
wind |
---|
0 |
03/01/2016 |
8 |
5 |
1 |
17/01/2016 |
12 |
2 |
2 |
31/01/2016 |
19 |
2 |
3 |
14/02/2016 |
-3 |
3 |
4 |
28/02/2016 |
19 |
2 |
5 |
13/03/2016 |
5 |
3 |
暂时没有评论,来抢沙发吧~