tynbl.github.io

Pandas数据操作

import pandas as pd
# Build a five-element Series labelled 'a'..'e' and preview it.
ser_obj = pd.Series(range(5), index=list('abcde'))
print(ser_obj.head())
a    0
b    1
c    2
d    3
e    4
dtype: int32
# Row indexing: select a single element by label or by position.
print(ser_obj['a'])
# Positional access goes through .iloc: an integer key in plain [] on a
# string-labelled index is a deprecated positional fallback in pandas.
print(ser_obj.iloc[0])
0
0
# Slice indexing
# A positional slice excludes its end point: 1:3 yields positions 1 and 2.
print(ser_obj[1:3])
# A label slice includes its end point: 'b':'d' yields b, c and d.
print(ser_obj['b':'d'])
b    1
c    2
dtype: int32
b    1
c    2
d    3
dtype: int32
# Non-contiguous indexing: pass a list of positions or a list of labels.
# Positions go through .iloc: a list of integers in plain [] on a
# string-labelled index is a deprecated positional fallback in pandas.
print(ser_obj.iloc[[0, 2, 4]])
print(ser_obj[['a', 'e']])
a    0
c    2
e    4
dtype: int32
a    0
e    4
dtype: int32
# Boolean indexing
# Comparing the Series with a scalar gives a boolean Series on the same index.
ser_bool = ser_obj > 2
print(ser_bool)
# Indexing with the boolean mask keeps only the entries marked True.
print(ser_obj[ser_bool])

# The mask can also be written inline.
print(ser_obj[ser_obj > 2])
a    False
b    False
c    False
d     True
e     True
dtype: bool
d    3
e    4
dtype: int32
d    3
e    4
dtype: int32
import numpy as np

# 5x4 frame of random samples with columns labelled 'a'..'d'.
df_obj = pd.DataFrame(data=np.random.randn(5, 4), columns=list('abcd'))
print(df_obj.head())
          a         b         c         d
0  1.487338 -0.131267  0.798305  0.456526
1  0.845968  0.582888 -0.940616 -2.007079
2  0.136282 -0.899088  0.705549 -1.411679
3 -0.416619 -0.379917  0.518185 -0.999199
4  0.945567 -0.548615 -0.265608 -1.448001
# Column indexing
print('列索引')
print(df_obj['a']) # a single column label returns a Series
# Passing a list of labels instead, e.g. df_obj[['a']], returns a DataFrame.

# Non-contiguous indexing
print('不连续索引')
print(df_obj[['a','c']])
# The list must hold column labels; df_obj has no columns labelled 1 or 3.
列索引
0    1.487338
1    0.845968
2    0.136282
3   -0.416619
4    0.945567
Name: a, dtype: float64
不连续索引
          a         c
0  1.487338  0.798305
1  0.845968 -0.940616
2  0.136282  0.705549
3 -0.416619  0.518185
4  0.945567 -0.265608
# Show both objects again for reference.
print(ser_obj)
print(df_obj)
a    0
b    1
c    2
d    3
e    4
dtype: int32
          a         b         c         d
0  1.487338 -0.131267  0.798305  0.456526
1  0.845968  0.582888 -0.940616 -2.007079
2  0.136282 -0.899088  0.705549 -1.411679
3 -0.416619 -0.379917  0.518185 -0.999199
4  0.945567 -0.548615 -0.265608 -1.448001
# Label-based indexing with loc
# Series: both forms slice by label and give the same result (b, c, d).
print(ser_obj['b':'d'])
print(ser_obj.loc['b':'d'])

# DataFrame
print(df_obj['a'])
# Rows labelled 0 through 2 (end label included) of column 'a'.
print(df_obj.loc[0:2, 'a'])
b    1
c    2
d    3
dtype: int32
b    1
c    2
d    3
dtype: int32
0    1.487338
1    0.845968
2    0.136282
3   -0.416619
4    0.945567
Name: a, dtype: float64
0    1.487338
1    0.845968
2    0.136282
Name: a, dtype: float64
# Show the Series again for reference.
print(ser_obj)
a    0
b    1
c    2
d    3
e    4
dtype: int32
# Integer-position indexing with iloc
# Series: the label slice includes 'd', while iloc[1:3] stops before position 3.
print(ser_obj['b':'d'])
print(ser_obj.iloc[1:3])

# DataFrame
print(df_obj.iloc[0:2, 0]) # note the difference from df_obj.loc[0:2, 'a']: iloc excludes the end position
print(df_obj.loc[0:2, 'a'])
b    1
c    2
d    3
dtype: int32
b    1
c    2
dtype: int32
0    1.487338
1    0.845968
Name: a, dtype: float64
0    1.487338
1    0.845968
2    0.136282
Name: a, dtype: float64
# Show the Series again for reference.
print(ser_obj)
a    0
b    1
c    2
d    3
e    4
dtype: int32
# Mixed indexing: the .ix indexer has been removed from pandas, so each lookup
# is spelled with the explicit indexer that .ix resolved it to.
print(ser_obj.iloc[1:3])     # integer slice on a string-labelled index: positional
print(ser_obj.loc['b':'c'])  # label slice, end label included

# DataFrame
# Rows are selected by label (0 through 2 inclusive), the column by position 0.
print(df_obj.loc[0:2, df_obj.columns[0]])
b    1
c    2
dtype: int32
b    1
c    2
dtype: int32
0    1.487338
1    0.845968
2    0.136282
Name: a, dtype: float64


d:\python34\lib\site-packages\ipykernel\__main__.py:2: DeprecationWarning: 
.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate_ix
  from ipykernel import kernelapp as app
d:\python34\lib\site-packages\ipykernel\__main__.py:6: DeprecationWarning: 
.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate_ix
# Two integer Series of different lengths whose indexes overlap on 0..4.
s1 = pd.Series(range(10, 20), index=range(10))
s2 = pd.Series(range(20, 25), index=range(5))

# Print each Series under its heading, with an empty line between the two.
print('s1: ', s1, '', 's2: ', s2, sep='\n')
s1: 
0    10
1    11
2    12
3    13
4    14
5    15
6    16
7    17
8    18
9    19
dtype: int32

s2: 
0    20
1    21
2    22
3    23
4    24
dtype: int32
# Series arithmetic aligns on the index
# Labels present in only one operand (5..9 here) produce NaN in the result.
s1 + s2
0    30.0
1    32.0
2    34.0
3    36.0
4    38.0
5     NaN
6     NaN
7     NaN
8     NaN
9     NaN
dtype: float64
import numpy as np

# Two all-ones frames: df1 is 2x2 (columns a, b), df2 is 3x3 (columns a, b, c).
df1 = pd.DataFrame(np.ones((2, 2)), columns=list('ab'))
df2 = pd.DataFrame(np.ones((3, 3)), columns=list('abc'))

# Print each frame under its heading, with an empty line between the two.
print('df1: ', df1, '', 'df2: ', df2, sep='\n')
df1: 
     a    b
0  1.0  1.0
1  1.0  1.0

df2: 
     a    b    c
0  1.0  1.0  1.0
1  1.0  1.0  1.0
2  1.0  1.0  1.0
# DataFrame arithmetic aligns on both the row index and the columns
# Cells that exist in only one frame (row 2 and column c here) become NaN.
df1 + df2
     a    b   c
0  2.0  2.0 NaN
1  2.0  2.0 NaN
2  NaN  NaN NaN
# Arithmetic with a fill value for labels missing from one operand
print(s1)
print(s2)

# Labels 5..9 are missing from s2, so -1 stands in for them (e.g. 15 + -1 = 14).
s1.add(s2, fill_value = -1)
0    10
1    11
2    12
3    13
4    14
5    15
6    16
7    17
8    18
9    19
dtype: int32
0    20
1    21
2    22
3    23
4    24
dtype: int32





0    30.0
1    32.0
2    34.0
3    36.0
4    38.0
5    14.0
6    15.0
7    16.0
8    17.0
9    18.0
dtype: float64
# Subtract df2 from df1, with 2.0 standing in for cells that df1 lacks.
df1.sub(df2, fill_value = 2.)
     a    b    c
0  0.0  0.0  1.0
1  0.0  0.0  1.0
2  1.0  1.0  1.0
# Filling NaN
# s3 holds NaN at the labels that s1 and s2 do not share.
s3 = s1 + s2
print(s3)
0    30.0
1    32.0
2    34.0
3    36.0
4    38.0
5     NaN
6     NaN
7     NaN
8     NaN
9     NaN
dtype: float64
# Replace every NaN in s3 with -1 and keep the result under a new name.
s3_filled = s3.fillna(-1)
print(s3_filled)
0    30.0
1    32.0
2    34.0
3    36.0
4    38.0
5    -1.0
6    -1.0
7    -1.0
8    -1.0
9    -1.0
dtype: float64
# Adding the misaligned frames leaves NaN in the cells only one of them has.
df3 = df1 + df2
print(df3)
     a    b   c
0  2.0  2.0 NaN
1  2.0  2.0 NaN
2  NaN  NaN NaN
# inplace=True makes fillna modify df3 itself instead of returning a new frame.
df3.fillna(100, inplace = True)
print(df3)
       a      b      c
0    2.0    2.0  100.0
1    2.0    2.0  100.0
2  100.0  100.0  100.0
# NumPy ufuncs can be applied directly to a DataFrame
df = pd.DataFrame(np.random.randn(5, 4) - 1)

# Show the frame, then its element-wise absolute values.
print(df, np.abs(df), sep='\n')
          0         1         2         3
0 -1.278841 -0.441358 -0.918558  1.086543
1  0.355898 -0.841866 -2.527455  0.355916
2 -1.455938 -0.485170 -0.427544 -1.265602
3 -0.509453 -1.181119 -1.544357 -3.855732
4 -1.197029  0.076550 -1.092266 -0.578271
          0         1         2         3
0  1.278841  0.441358  0.918558  1.086543
1  0.355898  0.841866  2.527455  0.355916
2  1.455938  0.485170  0.427544  1.265602
3  0.509453  1.181119  1.544357  3.855732
4  1.197029  0.076550  1.092266  0.578271
# Use apply to run a function over the data column by column (the default)
def f(x):
    """Return the maximum of *x* via its ``max`` method."""
    return x.max()

print(df.apply(f))
0    0.355898
1    0.076550
2   -0.427544
3    1.086543
dtype: float64
# Choose the axis: axis=1 applies the function to each row instead
print(df.apply(lambda x : x.max(), axis=1))
0    1.086543
1    0.355916
2   -0.427544
3   -0.509453
4    0.076550
dtype: float64
# Apply a function to every individual element
def f2(x):
    """Format a single value as a string with two decimal places."""
    return '%.2f' % x

# DataFrame.applymap is deprecated in favour of DataFrame.map in newer pandas;
# use the new name when it exists and fall back to applymap on older versions.
_elementwise = df.map if hasattr(df, 'map') else df.applymap
print(_elementwise(f2))
       0      1      2      3
0  -1.28  -0.44  -0.92   1.09
1   0.36  -0.84  -2.53   0.36
2  -1.46  -0.49  -0.43  -1.27
3  -0.51  -1.18  -1.54  -3.86
4  -1.20   0.08  -1.09  -0.58
# Series whose index labels are random integers in [0, 5), so labels can
# repeat and arrive unsorted.
s4 = pd.Series(range(10, 15), index=np.random.randint(0, 5, size=5))
print(s4)
3    10
0    11
0    12
3    13
1    14
dtype: int32
# Sort by index labels, here in descending order
s4.sort_index(ascending=False)
3    10
3    13
1    14
0    11
0    12
dtype: int32
# 3x4 frame of random values with randomly drawn integer row and column labels.
df4 = pd.DataFrame(
    np.random.randn(3, 4),
    index=np.random.randint(0, 3, size=3),
    columns=np.random.randint(0, 4, size=4),
)
print(df4)
          0         3         1         3
1 -1.074869  0.256095  0.136766 -0.606702
2 -1.115481  0.739818  1.292220  0.711678
0  0.245830 -0.144434 -1.078077  0.406480
# axis=1 sorts the columns by their labels; the rows keep their order.
# (df4.sort_index(ascending=False) would instead sort the row labels, descending.)
df4.sort_index(axis=1)
          0         1         3         3
1 -1.074869  0.136766  0.256095 -0.606702
2 -1.115481  1.292220  0.739818  0.711678
0  0.245830 -1.078077 -0.144434  0.406480
# Sort rows by the values in the column labelled 0
# NOTE(review): df4's column labels are drawn at random, so a column labelled 0
# may be missing or duplicated on another run; confirm before relying on this call.
df4.sort_values(by=0)
          0         3         1         3
2 -1.115481  0.739818  1.292220  0.711678
1 -1.074869  0.256095  0.136766 -0.606702
0  0.245830 -0.144434 -1.078077  0.406480
# Frame with one fully populated random row followed by rows containing NaN,
# used by the missing-data examples.
df_data = pd.DataFrame([
    np.random.randn(3),
    [1.0, np.nan, np.nan],
    [4.0, np.nan, np.nan],
    [1.0, np.nan, 2.0],
])
df_data.head()
0 1 2
0 -0.013522 0.22717 -0.202574
1 1.000000 NaN NaN
2 4.000000 NaN NaN
3 1.000000 NaN 2.000000
# isnull: True in every cell whose value is missing (NaN), False elsewhere
df_data.isnull()
0 1 2
0 False False False
1 False True True
2 False True True
3 False True False
# dropna: with default arguments, rows containing any NaN are dropped,
# so only row 0 remains here.
df_data.dropna()
# (df_data.dropna(axis=1) would drop the columns containing NaN instead.)
0 1 2
0 -0.013522 0.22717 -0.202574
# fillna: replace every NaN with -100.0
df_data.fillna(-100.)
0 1 2
0 -0.013522 0.22717 -0.202574
1 1.000000 -100.00000 -100.000000
2 4.000000 -100.00000 -100.000000
3 1.000000 -100.00000 2.000000