Pandas库学习

2019-07-22

Python库之pandas

环境：Pycharm2019.01，python3.5

Series与DataFrame

Series与DataFrame分别对应于一维序列与二维表结构

Series

创建Series，索引默认从0开始：

import pandas as pd
import numpy as np

# Build a Series from a plain list; the index defaults to 0..n-1.
s1 = pd.Series([1, 2, -3, -5])
print(s1)
# Show the stored values together with the type of s1 itself.
print(s1.values, type(s1))
# Show the index: a RangeIndex covering [0, 4) with step 1.
print(s1.index)
- - - - - - - - - 
0    1
1    2
2   -3
3   -5
dtype: int64
[ 1  2 -3 -5] <class 'pandas.core.series.Series'>
RangeIndex(start=0, stop=4, step=1)

pandas是基于numpy构建的，所以Series的值（s1.values）底层是numpy的ndarray数组，而s1本身的类型（type(s1)）是pandas的Series。

创建索引，不使用默认索引：

# Supply explicit labels instead of the default integer index.
s2 = pd.Series(
    [1, 2.0, -3, -5],
    index=['a', 'b', 'c', 'd'],
)
print(s2)
# Look a value up by its label; there must be one label per value.
print(s2['c'])
- - - - - - - - -
a    1.0
b    2.0
c   -3.0
d   -5.0
dtype: float64
-3.0

Series可以看成一个定长的有序字典：它把索引映射到值。（在本文使用的Python 3.5中）普通字典不保证顺序，而Series的元素顺序在创建时就固定了。

DataFrame

import pandas as pd
import numpy as np

# Each key becomes a column name; each list holds that column's values.
data = {
    'year': [2014, 2015, 2016, 2017],
    'income': [10, 20, 30, 40],
    'outcome': [10, 20, 30, 40],
}
# Build the table from the dict; rows get the default integer index.
df1 = pd.DataFrame(data)
print(df1)
- - - - - - - - - - - - -
   income  outcome  year
0      10       10  2014
1      20       20  2015
2      30       30  2016
3      40       40  2017

使用其他方法创建DataFrame:

# DataFrame from a 2-D array: rows and columns get default integer labels.
df2 = pd.DataFrame(np.arange(10).reshape(2, 5))
# Same construction, but with explicit row labels and column labels.
df3 = pd.DataFrame(
    np.arange(12).reshape(3, 4),
    index=['a', 'b', 'c'],
    columns=[1, 5, 2, 6],
)
print(df2)
print(df3)
print(df2.index)  # row labels
print(df2.columns)  # column labels
print(df2.values)  # the data as a 2-D array
a = df2.values
print(type(a))  # a is a numpy ndarray
print(df2.describe())  # per-column summary statistics
- - - - - - - - - - - - 
   0  1  2  3  4
0  0  1  2  3  4
1  5  6  7  8  9
   1  5   2   6
a  0  1   2   3
b  4  5   6   7
c  8  9  10  11
RangeIndex(start=0, stop=2, step=1)
RangeIndex(start=0, stop=5, step=1)
[[0 1 2 3 4]
 [5 6 7 8 9]]
<class 'numpy.ndarray'>
              0         1         2         3         4
count  2.000000  2.000000  2.000000  2.000000  2.000000 # 非空值个数
mean   2.500000  3.500000  4.500000  5.500000  6.500000 # 平均值
std    3.535534  3.535534  3.535534  3.535534  3.535534 # 标准差
min    0.000000  1.000000  2.000000  3.000000  4.000000 # 最小值
25%    1.250000  2.250000  3.250000  4.250000  5.250000 # 第25百分位数（下四分位数）
50%    2.500000  3.500000  4.500000  5.500000  6.500000 # 中位数
75%    3.750000  4.750000  5.750000  6.750000  7.750000 # 第75百分位数（上四分位数）
max    5.000000  6.000000  7.000000  8.000000  9.000000 # 最大值

转置与排序：

# 2x5 table with custom, unsorted row and column labels.
df4 = pd.DataFrame(
    np.arange(10).reshape(2, 5),
    index=[1, 2],
    columns=[2, 1, 5, 6, 0],
)
print(df4)
print(df4.T)  # transpose: rows become columns
print(df4.sort_index(axis=1))  # order the columns by column label
print(df4.sort_values(by=5))  # order the rows by the values in the column labelled 5
- - - - - - - - - - - - -
   2  1  5  6  0
1  0  1  2  3  4
2  5  6  7  8  9
   1  2
2  0  5
1  1  6
5  2  7
6  3  8
0  4  9
   0  1  2  5  6
1  4  1  0  2  3
2  9  6  5  7  8
   2  1  5  6  0
1  0  1  2  3  4
2  5  6  7  8  9

pandas选择数据

# Three consecutive daily timestamps used as row labels.
indexs = pd.date_range('20190105', periods=3)
df5 = pd.DataFrame(
    np.arange(12).reshape(3, 4),
    index=indexs,
    columns=['A', 'B', 'C', 'D'],
)
print(df5)
# A single column comes back as a Series; df5.B is the equivalent attribute form.
print(df5['B'])
print(df5[0:2])  # first two rows
print(df5.loc['20190107', ['B', 'D']])  # label-based: pick a row, then columns within it
print(df5.iloc[0:3, 2:4])  # position-based, 0-indexed like numpy
print(df5.iloc[[0, 2], [1, 3]])  # non-contiguous rows and columns by position
- - - - - - - - - - - - - - -
            A  B   C   D
2019-01-05  0  1   2   3
2019-01-06  4  5   6   7
2019-01-07  8  9  10  11
2019-01-05    1
2019-01-06    5
2019-01-07    9
Freq: D, Name: B, dtype: int64
            A  B  C  D
2019-01-05  0  1  2  3
2019-01-06  4  5  6  7
B     9
D    11
Name: 2019-01-07 00:00:00, dtype: int64
             C   D
2019-01-05   2   3
2019-01-06   6   7
2019-01-07  10  11
            B   D
2019-01-05  1   3
2019-01-07  9  11

通过混合标签选择数据：

indexs = pd.date_range('20190105', periods=3)
df6 = pd.DataFrame(
    np.arange(12).reshape(3, 4),
    index=indexs,
    columns=['A', 'B', 'C', 'D'],
)
print(df6)
# Mixed selection: rows by position, columns by label. The `.ix` indexer that
# did this in one call was deprecated in pandas 0.20 and removed in 1.0, so
# chain iloc (positions) and loc (labels) instead.
print(df6.iloc[1:3].loc[:, ['B', 'D']])
# Element-wise comparison yields a boolean Series. Such a mask can split a
# dataset by class, e.g. data1 = data[y == 1] for logistic regression.
print(df6.C >= 6)
- - - - - - - - - - - - - - 
            A  B   C   D
2019-01-05  0  1   2   3
2019-01-06  4  5   6   7
2019-01-07  8  9  10  11
            B   D
2019-01-06  5   7
2019-01-07  9  11
2019-01-05    False
2019-01-06     True
2019-01-07     True
Freq: D, Name: C, dtype: bool

pandas赋值及操作

indexs=np.arange(20190101,20190103)
df7=pd.DataFrame(np.arange(10).reshape(2,5),index=indexs,columns=['A',"B",'C','D','E'])
print(df7)
print(df7.iloc[1,3]) # value at row position 1, column position 3 (both 0-based)
df7[df7.iloc[0:1]<3]=3 # replace the values in row 0 that are less than 3 with 3
print(df7)
df7['F']=10  # add a column 'F' filled with 10; df7.loc['F']=10 would add a row filled with 10 instead
print(df7)
- - - - - - - - - - - - - - -
          A  B  C  D  E
20190101  0  1  2  3  4
20190102  5  6  7  8  9
8
          A  B  C  D  E
20190101  3  3  3  3  4
20190102  5  6  7  8  9
          A  B  C  D  E   F
20190101  3  3  3  3  4  10
20190102  5  6  7  8  9  10

插入行或列：

indexs = np.arange(20190101, 20190103)
df7 = pd.DataFrame(
    np.arange(10).reshape(2, 5),
    index=indexs,
    columns=['A', "B", 'C', 'D', 'E'],
)
print(df7)
s1 = pd.Series([1, 2, 3, 4, 5], index=['A', 'B', 'C', 'D', 'E'])
s1.name = 'S1'
# Add s1 as a new last row labelled by its name. DataFrame.append was removed
# in pandas 2.0; concatenating the Series as a one-row frame gives the same
# result and also works on older versions.
df7 = pd.concat([df7, s1.to_frame().T])
print(df7)
# Insert a column 'F' at the last position, filled with 10.
# df7.pop('F') or del df7['F'] would remove that column again.
df7.insert(df7.shape[1], 'F', 10)
print(df7)
df7 = df7.drop(['E', 'F'], axis=1)  # drop columns
print(df7)
- - - - - - - - - - - - 
          A  B  C  D  E
20190101  0  1  2  3  4
20190102  5  6  7  8  9
          A  B  C  D  E
20190101  0  1  2  3  4
20190102  5  6  7  8  9
S1        1  2  3  4  5
          A  B  C  D  E   F
20190101  0  1  2  3  4  10
20190102  5  6  7  8  9  10
S1        1  2  3  4  5  10
          A  B  C  D
20190101  0  1  2  3
20190102  5  6  7  8
S1        1  2  3  4

pandas处理丢失数据

indexs = np.arange(1, 4)
df1 = pd.DataFrame(
    np.arange(12).reshape(3, 4),
    index=indexs,
    columns=['A', 'B', 'C', 'D'],
)
# Rebuild df1 with an extra column 'E'; it has no source data, so it is all NaN.
df2 = pd.DataFrame(df1, index=indexs, columns=['A', 'B', 'C', 'D', 'E'])
print(df1)
print(df2)
# axis=1 drops columns. how='any' drops a column holding at least one NaN;
# how='all' drops it only when every value is NaN.
print(df2.dropna(axis=1, how='any'))
print(df2.fillna(value=1))  # fill every missing value with 1
# True if any value is missing; np.all() would require all to be missing.
print(np.any(df2.isnull()))
- - - - - - - - - - - -
   A  B   C   D
1  0  1   2   3
2  4  5   6   7
3  8  9  10  11
   A  B   C   D   E
1  0  1   2   3 NaN
2  4  5   6   7 NaN
3  8  9  10  11 NaN
   A  B   C   D
1  0  1   2   3
2  4  5   6   7
3  8  9  10  11
   A  B   C   D    E
1  0  1   2   3  1.0
2  4  5   6   7  1.0
3  8  9  10  11  1.0
True

pandas读取及写入文件

import pandas as pd
import numpy as np

# Load a CSV file into a DataFrame. Optional arguments include delimiter=','
# (the field separator) and dtype=int (parse the data as integers).
data = pd.read_csv('file_path')
# Write the DataFrame back out in CSV format.
data.to_csv('file_path')

pandas数据合并

df1 = pd.DataFrame(np.arange(12).reshape(3, 4), columns=['A', 'B', 'C', 'D'])
df2 = pd.DataFrame(np.arange(12, 24).reshape(3, 4), columns=['A', 'B', 'C', 'D'])
df3 = pd.DataFrame(np.arange(24, 36).reshape(3, 4), columns=['A', 'B', 'C', 'D'])
# Stack vertically. Column names should match, otherwise NaN gaps appear.
df4 = pd.concat([df1, df2, df3], axis=0)
print(df4)
# Stack vertically, discarding the original row labels and renumbering from 0.
print(pd.concat([df1, df2, df3], axis=0, ignore_index=True))
# Join side by side; when axis is omitted it defaults to 0.
print(pd.concat([df1, df2, df3], axis=1))
df5 = pd.DataFrame(np.arange(24, 36).reshape(3, 4), columns=['A', 'B', 'c', 'd'])
# axis defaults to 0. join='inner' keeps only the columns present in every
# frame; join='outer' keeps all columns and fills the gaps with NaN.
print(pd.concat([df1, df2, df5], join='inner', ignore_index=True))
df2 = pd.DataFrame(np.arange(12, 24).reshape(4, 3), columns=['A', 'C', 'D'])
# Join side by side using df1's row labels only. The join_axes argument was
# removed in pandas 1.0; reindexing df2 to df1.index first gives the same
# result, and without it the extra df2 row would introduce NaN.
print(pd.concat([df1, df2.reindex(df1.index)], axis=1))
- - - - - - - - - - - - - - - - - - - - - 
    A   B   C   D
0   0   1   2   3
1   4   5   6   7
2   8   9  10  11
0  12  13  14  15
1  16  17  18  19
2  20  21  22  23
0  24  25  26  27
1  28  29  30  31
2  32  33  34  35
    A   B   C   D
0   0   1   2   3
1   4   5   6   7
2   8   9  10  11
3  12  13  14  15
4  16  17  18  19
5  20  21  22  23
6  24  25  26  27
7  28  29  30  31
8  32  33  34  35
   A  B   C   D   A   B   C   D   A   B   C   D
0  0  1   2   3  12  13  14  15  24  25  26  27
1  4  5   6   7  16  17  18  19  28  29  30  31
2  8  9  10  11  20  21  22  23  32  33  34  35
    A   B
0   0   1
1   4   5
2   8   9
3  12  13
4  16  17
5  20  21
6  24  25
7  28  29
8  32  33
   A  B   C   D   A   C   D
0  0  1   2   3  12  13  14
1  4  5   6   7  15  16  17
2  8  9  10  11  18  19  20

pandas合并merge

# Two tables that share the 'key' column.
df1 = pd.DataFrame({
    'key': [1, 2, 3, 4],
    '1': [11, 12, 13, 14],
    '2': [21, 22, 23, 24],
})
df2 = pd.DataFrame({
    'key': [1, 2, 3, 4],
    '3': [31, 32, 33, 34],
    '4': [41, 42, 43, 44],
})
print(df1)
print(df2)
# Join the rows whose 'key' values are equal. The how= argument accepts
# 'left', 'right', 'inner' (the default) or 'outer'; 'left'/'right' keep only
# that side's keys and 'outer' fills the gaps with NaN. indicator=True adds a
# column showing which side each row came from.
print(pd.merge(df1, df2, on='key'))
- - - - - - - - - - - - - - - - - - - - - 
    1   2  key
0  11  21    1
1  12  22    2
2  13  23    3
3  14  24    4
    3   4  key
0  31  41    1
1  32  42    2
2  33  43    3
3  34  44    4
    1   2  key   3   4
0  11  21    1  31  41
1  12  22    2  32  42
2  13  23    3  33  43
3  14  24    4  34  44

pandas plot

import matplotlib.pyplot as plt  # plt is used below, so it must be imported here

# 1000 standard-normal samples in a single column, labelled 0..999.
x = pd.DataFrame(np.random.randn(1000), index=np.arange(1000))
x = x.cumsum()  # running total
plt.plot(x)  # draw the series
plt.show()

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# 1000 rows x 4 columns of standard-normal samples, with explicit row and
# column labels.
x = pd.DataFrame(
    np.random.randn(1000, 4),
    index=np.arange(1000),
    columns=['A', 'B', 'C', 'D'],
)
x = x.cumsum()  # running total down each column
print(x.head())  # first five rows
plt.plot(x)
plt.show()
- - - - - - - - - - - - - - - - - - - - - 
          A         B         C         D
0  1.659744  0.107002 -0.370865 -0.339613
1  0.735340 -0.106536  0.469888 -1.700528
2  2.015053 -0.821541 -0.234350 -1.447786
3  3.367802 -0.139346 -2.070282 -1.300126
4  2.802626 -0.284305 -3.187980 -1.908843