python-pandas操作

DataFrame
# DataFrame
# DataFrame是⼀个表格型的数据结构,它含有⼀组有序的列,每
# 列可以是不同的值类型(数值、字符串、布尔值等)。
# DataFrame既有⾏索引也有列索引,它可以被看做由
data = {'state': ['Ohio', 'Ohio', 'Ohio', 'Nevada', 'Nevada', 'Nevada'],
'year': [2000, 2001, 2002, 2001, 2002, 2003],
'pop': [1.5, 1.7, 3.6, 2.4, 2.9, 3.2]}
frame = pd.DataFrame(data)
frame
state year pop
0 Ohio 2000 1.5
1 Ohio 2001 1.7
2 Ohio 2002 3.6
3 Nevada 2001 2.4
4 Nevada 2002 2.9
5 Nevada 2003 3.2
# 对于特别⼤的DataFrame,head⽅法会选取前五⾏:
frame.head()
state year pop
0 Ohio 2000 1.5
1 Ohio 2001 1.7
2 Ohio 2002 3.6
3 Nevada 2001 2.4
4 Nevada 2002 2.9
# 设置列名
pd.DataFrame(data, columns=['year', 'state', 'pop'])
year state pop
0 2000 Ohio 1.5
1 2001 Ohio 1.7
2 2002 Ohio 3.6
3 2001 Nevada 2.4
4 2002 Nevada 2.9
5 2003 Nevada 3.2
frame2 = pd.DataFrame(data, columns=['year', 'state', 'pop', 'debt'],
index=['one', 'two', 'three', 'four',
'five', 'six'])
frame2
frame2.columns
Index(['year', 'state', 'pop', 'debt'], dtype='object')
frame2['state']
frame2.year
one 2000
two 2001
three 2002
four 2001
five 2002
six 2003
Name: year, dtype: int64
# frame2[column]适⽤于任何列的名,但是frame2.column只有
# 在列名是⼀个合理的Python变量名时才适⽤。
frame2.loc['three']
year 2002
state Ohio
pop 3.6
debt NaN
Name: three, dtype: object
# 给那个空
# 的"debt"列赋上⼀个标量值或⼀组值:
# 给空列赋值
frame2['debt'] = 16.5
frame2
frame2['debt'] = np.arange(6.)
frame2
year state pop debt
one 2000 Ohio 1.5 0.0
two 2001 Ohio 1.7 1.0
three 2002 Ohio 3.6 2.0
four 2001 Nevada 2.4 3.0
five 2002 Nevada 2.9 4.0
six 2003 Nevada 3.2 5.0
val = pd.Series([-1.2, -1.5, -1.7], index=['two', 'four', 'five'])
frame2['debt'] = val
frame2
year state pop debt
one 2000 Ohio 1.5 NaN
two 2001 Ohio 1.7 -1.2
three 2002 Ohio 3.6 NaN
four 2001 Nevada 2.4 -1.5
five 2002 Nevada 2.9 -1.7
six 2003 Nevada 3.2 NaN
frame2['eastern'] = frame2.state == 'Ohio'
frame2
year state pop debt eastern
one 2000 Ohio 1.5 NaN True
two 2001 Ohio 1.7 -1.2 True
three 2002 Ohio 3.6 NaN True
four 2001 Nevada 2.4 -1.5 False
five 2002 Nevada 2.9 -1.7 False
six 2003 Nevada 3.2 NaN False
del frame2['eastern']
frame2.columns
Index(['year', 'state', 'pop', 'debt'], dtype='object')
pop = {'Nevada': {2001: 2.4, 2002: 2.9},
'Ohio': {2000: 1.5, 2001: 1.7, 2002: 3.6}}
frame3 = pd.DataFrame(pop)
frame3
Nevada Ohio
2000 NaN 1.5
2001 2.4 1.7
2002 2.9 3.6
frame3.T
year 2000 2001 2002
state
Nevada NaN 2.4 2.9
Ohio 1.5 1.7 3.6
# pd.DataFrame(pop, index=[2001, 2002, 2003])
pdata = {'Ohio': frame3['Ohio'][:-1],
'Nevada': frame3['Nevada'][:2]}
pd.DataFrame(pdata)
Ohio Nevada
2000 1.5 NaN
2001 1.7 2.4
frame3.index.name = 'year'; frame3.columns.name = 'state'
frame3
state Nevada Ohio
year
2000 NaN 1.5
2001 2.4 1.7
2002 2.9 3.6
frame3.values
array([[nan, 1.5],
[2.4, 1.7],
[2.9, 3.6]])
frame2.values
array([[2000, 'Ohio', 1.5, nan],
[2001, 'Ohio', 1.7, -1.2],
[2002, 'Ohio', 3.6, nan],
[2001, 'Nevada', 2.4, -1.5],
[2002, 'Nevada', 2.9, -1.7],
[2003, 'Nevada', 3.2, nan]], dtype=object)
Index Objects
# 索引对象
# pandas的索引对象负责管理轴标签和其他元数据(⽐如轴名称
# 等)。构建Series或DataFrame时,所⽤到的任何数组或其他序
# 列的标签都会被转换成⼀个Index:
obj = pd.Series(range(3), index=['a', 'b', 'c'])
index = obj.index
index
index[1:]
Index(['b', 'c'], dtype='object')
index[1] = 'd' # TypeError

labels = pd.Index(np.arange(3))
labels
obj2 = pd.Series([1.5, -2.5, 0], index=labels)
obj2
obj2.index is labels
True
frame3
frame3.columns
'Ohio' in frame3.columns
2003 in frame3.index
False
dup_labels = pd.Index(['foo', 'foo', 'bar', 'bar'])
dup_labels
Index(['foo', 'foo', 'bar', 'bar'], dtype='object')
Essential Functionality
Reindexing
# 重新索引 重建索引
# pandas对象的⼀个重要⽅法是reindex,其作⽤是创建⼀个新对
# 象,它的数据符合新的索引。
obj = pd.Series([4.5, 7.2, -5.3, 3.6], index=['d', 'b', 'a', 'c'])
obj
d 4.5
b 7.2
a -5.3
c 3.6
dtype: float64
obj2 = obj.reindex(['a', 'b', 'c', 'd', 'e'])
obj2
a -5.3
b 7.2
c 3.6
d 4.5
e NaN
dtype: float64
obj3 = pd.Series(['blue', 'purple', 'yellow'], index=[0, 2, 4])
obj3
obj3.reindex(range(6), method='ffill')
0 blue
1 blue
2 purple
3 purple
4 yellow
5 yellow
dtype: object
frame = pd.DataFrame(np.arange(9).reshape((3, 3)),
index=['a', 'c', 'd'],
columns=['Ohio', 'Texas', 'California'])
frame
frame2 = frame.reindex(['a', 'b', 'c', 'd'])
frame2
Ohio Texas California
a 0.0 1.0 2.0
b NaN NaN NaN
c 3.0 4.0 5.0
d 6.0 7.0 8.0
# 列可以⽤columns关键字重新索引:
states = ['Texas', 'Utah', 'California']
frame.reindex(columns=states)
Texas Utah California
a 1 NaN 2
c 4 NaN 5
d 7 NaN 8
# frame.loc[['a', 'b', 'c', 'd'], states]
Dropping Entries from an Axis
# 丢弃指定轴上的项 删除某一列
obj = pd.Series(np.arange(5.), index=['a', 'b', 'c', 'd', 'e'])
obj
new_obj = obj.drop('c')
new_obj
obj.drop(['d', 'c'])
a 0.0
b 1.0
e 4.0
dtype: float64
data = pd.DataFrame(np.arange(16).reshape((4, 4)),
index=['Ohio', 'Colorado', 'Utah', 'New York'],
columns=['one', 'two', 'three', 'four'])
data
one two three four
Ohio 0 1 2 3
Colorado 4 5 6 7
Utah 8 9 10 11
New York 12 13 14 15
data.drop(['Colorado', 'Ohio'])
one two three four
Utah 8 9 10 11
New York 12 13 14 15
data.drop('two', axis=1)
data.drop(['two', 'four'], axis='columns')
one three
Ohio 0 2
Colorado 4 6
Utah 8 10
New York 12 14
obj.drop('c', inplace=True)
obj
a 0.0
b 1.0
d 3.0
e 4.0
dtype: float64
Indexing, Selection, and Filtering
obj = pd.Series(np.arange(4.), index=['a', 'b', 'c', 'd'])
obj
obj['b']
obj[1]
obj[2:4]
obj[['b', 'a', 'd']]
obj[[1, 3]]
obj[obj < 2]
a 0.0
b 1.0
dtype: float64
obj['b':'c']
b 1.0
c 2.0
dtype: float64
obj['b':'c'] = 5
obj
a 0.0
b 5.0
c 5.0
d 3.0
dtype: float64
data = pd.DataFrame(np.arange(16).reshape((4, 4)),
index=['Ohio', 'Colorado', 'Utah', 'New York'],
columns=['one', 'two', 'three', 'four'])
data
data['two']
data[['three', 'one']]
three one
Ohio 2 0
Colorado 6 4
Utah 10 8
New York 14 12
data[:2]
data[data['three'] > 5]
one two three four
Colorado 4 5 6 7
Utah 8 9 10 11
New York 12 13 14 15
data < 5
data[data < 5] = 0
data
one two three four
Ohio 0 0 0 0
Colorado 0 5 6 7
Utah 8 9 10 11
New York 12 13 14 15
Selection with loc and iloc
# ⽤loc和iloc进⾏选取
# 对于DataFrame的⾏的标签索引,我引⼊了特殊的标签运算符
# loc和iloc。它们可以让你⽤类似NumPy的标记,使⽤轴标签
# (loc)或整数索引(iloc),从DataFrame选择⾏和列的⼦集。
data.loc['Colorado', ['two', 'three']]
two 5
three 6
Name: Colorado, dtype: int32
data.iloc[2, [3, 0, 1]]
data.iloc[2]
data.iloc[[1, 2], [3, 0, 1]]
four one two
Colorado 7 0 5
Utah 11 8 9
data.loc[:'Utah', 'two']
data.iloc[:, :3][data.three > 5]
one two three
Colorado 0 5 6
Utah 8 9 10
New York 12 13 14
Integer Indexes
ser = pd.Series(np.arange(3.)) ser ser[-1]

# 整数索引
# 处理整数索引的pandas对象常常难住新⼿,因为它与Python内
# 置的列表和元组的索引语法不同。
ser = pd.Series(np.arange(3.))
ser
0 0.0
1 1.0
2 2.0
dtype: float64
ser2 = pd.Series(np.arange(3.), index=['a', 'b', 'c'])
ser2[-1]
2.0
ser[:1]
ser.loc[:1]
ser.iloc[:1]
0 0.0
dtype: float64
Arithmetic and Data Alignment
# 算术运算和数据对⻬
# pandas最重要的⼀个功能是,它可以对不同索引的对象进⾏算
# 术运算。在将对象相加时,如果存在不同的索引对,则结果的索
# 引就是该索引对的并集。对于有数据库经验的⽤户,这就像在索
# 引标签上进⾏⾃动外连接
s1 = pd.Series([7.3, -2.5, 3.4, 1.5], index=['a', 'c', 'd', 'e'])
s2 = pd.Series([-2.1, 3.6, -1.5, 4, 3.1],
index=['a', 'c', 'e', 'f', 'g'])
s1
s2
a -2.1
c 3.6
e -1.5
f 4.0
g 3.1
dtype: float64
s1 + s2
a 5.2
c 1.1
d NaN
e 0.0
f NaN
g NaN
dtype: float64
df1 = pd.DataFrame(np.arange(9.).reshape((3, 3)), columns=list('bcd'),
index=['Ohio', 'Texas', 'Colorado'])
df2 = pd.DataFrame(np.arange(12.).reshape((4, 3)), columns=list('bde'),
index=['Utah', 'Ohio', 'Texas', 'Oregon'])
df1
df2
b d e
Utah 0.0 1.0 2.0
Ohio 3.0 4.0 5.0
Texas 6.0 7.0 8.0
Oregon 9.0 10.0 11.0
df1 + df2
b c d e
Colorado NaN NaN NaN NaN
Ohio 3.0 NaN 6.0 NaN
Oregon NaN NaN NaN NaN
Texas 9.0 NaN 12.0 NaN
Utah NaN NaN NaN NaN
df1 = pd.DataFrame({'A': [1, 2]})
df2 = pd.DataFrame({'B': [3, 4]})
df1
df2
df1 - df2
A B
0 NaN NaN
1 NaN NaN
Arithmetic methods with fill values
df1 = pd.DataFrame(np.arange(12.).reshape((3, 4)),
columns=list('abcd'))
df2 = pd.DataFrame(np.arange(20.).reshape((4, 5)),
columns=list('abcde'))
df2.loc[1, 'b'] = np.nan
df1
df2
a b c d e
0 0.0 1.0 2.0 3.0 4.0
1 5.0 NaN 7.0 8.0 9.0
2 10.0 11.0 12.0 13.0 14.0
3 15.0 16.0 17.0 18.0 19.0
df1 + df2
a b c d e
0 0.0 2.0 4.0 6.0 NaN
1 9.0 NaN 13.0 15.0 NaN
2 18.0 20.0 22.0 24.0 NaN
3 NaN NaN NaN NaN NaN
df1.add(df2, fill_value=0)
a b c d e
0 0.0 2.0 4.0 6.0 4.0
1 9.0 5.0 13.0 15.0 9.0
2 18.0 20.0 22.0 24.0 14.0
3 15.0 16.0 17.0 18.0 19.0
1 / df1
df1.rdiv(1)
a b c d
0 inf 1.000000 0.500000 0.333333
1 0.250000 0.200000 0.166667 0.142857
2 0.125000 0.111111 0.100000 0.090909
df1.reindex(columns=df2.columns, fill_value=0)
a b c d e
0 0.0 1.0 2.0 3.0 0
1 4.0 5.0 6.0 7.0 0
2 8.0 9.0 10.0 11.0 0
Operations between DataFrame and Series
arr = np.arange(12.).reshape((3, 4))
arr
arr[0]
arr - arr[0]
array([[0., 0., 0., 0.],
[4., 4., 4., 4.],
[8., 8., 8., 8.]])
frame = pd.DataFrame(np.arange(12.).reshape((4, 3)),
columns=list('bde'),
index=['Utah', 'Ohio', 'Texas', 'Oregon'])
series = frame.iloc[0]
frame
series
b 0.0
d 1.0
e 2.0
Name: Utah, dtype: float64
frame - series
b d e
Utah 0.0 0.0 0.0
Ohio 3.0 3.0 3.0
Texas 6.0 6.0 6.0
Oregon 9.0 9.0 9.0
series2 = pd.Series(range(3), index=['b', 'e', 'f'])
frame + series2
b d e f
Utah 0.0 NaN 3.0 NaN
Ohio 3.0 NaN 6.0 NaN
Texas 6.0 NaN 9.0 NaN
Oregon 9.0 NaN 12.0 NaN
series3 = frame['d']
frame
series3
frame.sub(series3, axis='index')
b d e
Utah -1.0 0.0 1.0
Ohio -1.0 0.0 1.0
Texas -1.0 0.0 1.0
Oregon -1.0 0.0 1.0
Function Application and Mapping
# 函数应⽤和映射
frame = pd.DataFrame(np.random.randn(4, 3), columns=list('bde'),
index=['Utah', 'Ohio', 'Texas', 'Oregon'])
frame
np.abs(frame)
b d e
Utah 0.204708 0.478943 0.519439
Ohio 0.555730 1.965781 1.393406
Texas 0.092908 0.281746 0.769023
Oregon 1.246435 1.007189 1.296221
# 另⼀个常⻅的操作是,将函数应⽤到由各列或⾏所形成的⼀维数
# 组上。DataFrame的apply⽅法即可实现此功能:
f = lambda x: x.max() - x.min()
frame.apply(f)
b 1.802165
d 1.684034
e 2.689627
dtype: float64
frame.apply(f, axis='columns')
Utah 0.998382
Ohio 2.521511
Texas 0.676115
Oregon 2.542656
dtype: float64
def f(x):
return pd.Series([x.min(), x.max()], index=['min', 'max'])
frame.apply(f)
b d e
min -0.555730 0.281746 -1.296221
max 1.246435 1.965781 1.393406
format = lambda x: '%.2f' % x
frame.applymap(format)
b d e
Utah -0.20 0.48 -0.52
Ohio -0.56 1.97 1.39
Texas 0.09 0.28 0.77
Oregon 1.25 1.01 -1.30
frame['e'].map(format)
Utah -0.52
Ohio 1.39
Texas 0.77
Oregon -1.30
Name: e, dtype: object

Original: https://blog.51cto.com/u_10055401/5482242
Author: 六mo神剑
Title: python-pandas操作

原创文章受到原创版权保护。转载请注明出处:https://www.johngo689.com/513629/

转载文章受原作者版权保护。转载请注明原作者出处!

(0)

大家都在看

亲爱的 Coder【最近整理,可免费获取】👉 最新必读书单  | 👏 面试题下载  | 🌎 免费的AI知识星球