Pandas 数据分析代码示例——Python中数据处理库

/ Python / 没有评论 / 280浏览

pandas 方法

read_csv—读逗号分隔的数据

import pandas

# Load the comma-separated file into a DataFrame.
df = pandas.read_csv("test.txt")

# Show the container type first, then the dtype of each column.
frame_type = type(df)
column_dtypes = df.dtypes
print(frame_type)
print(column_dtypes)

<class 'pandas.core.frame.DataFrame'>
No. int64
Content object
Value int64
dtype: object

to_datetime—时间格式转化

import pandas

df = pandas.read_csv("test1.csv")
# Replace the "Date" column with its datetime-parsed version.
parsed_dates = pandas.to_datetime(df["Date"])
df["Date"] = parsed_dates
print(df.head(3))

No Content Values Values1 Date
0 1 a 111.0 32.0 2017-01-01
1 2 b NaN 45.0 2017-02-01
2 3 c 333.0 25.0 2017-03-01

DataFrame

DataFrame数据类型

| 类型 | 意义 | P.S |
| --- | --- | --- |
| object | string | |
| int | integer | |
| float | float | |
| datetime | time | |
| bool | boolean | |

head—取前几行

df = pandas.read_csv("test.txt")

# head(n) returns the first n rows; with no argument it returns the first 5.
first_row = df.head(1)
print(first_row)
print("====================================================")
default_head = df.head()
print(default_head)

No. Content Value
0 1 a 111
——————————————————————————————————————————
No. Content Value
0 1 a 111
1 2 b 222
2 3 c 333
3 4 d 444

tail—取后几行

df = pandas.read_csv("test.txt")

# tail(n) returns the last n rows; with no argument it returns the last 5.
last_row = df.tail(1)
print(last_row)
print("====================================================")
default_tail = df.tail()
print(default_tail)

No. Content Value
3 4 d 444
——————————————————————————————————————————
No. Content Value
0 1 a 111
1 2 b 222
2 3 c 333
3 4 d 444

columns—获取列名

df = pandas.read_csv("test.txt")
# The column labels are exposed as an Index object.
column_labels = df.columns
print(column_labels)

Index(['No.', 'Content', 'Value'], dtype='object')

shape—查看行数和列数

df = pandas.read_csv("test.txt")
# shape is a (row count, column count) tuple.
dimensions = df.shape
print(dimensions)

(4, 3)

loc—定位取索引(行号)

df = pandas.read_csv("test.txt")

# A single label returns that row as a Series.
single_row = df.loc[3]
print(single_row)
print("====================================================")
# A list of labels returns just those rows.
picked_rows = df.loc[[1, 3]]
print(picked_rows)
print("====================================================")
# A label slice includes both endpoints (rows 1, 2 and 3).
row_range = df.loc[1:3]
print(row_range)

No. 4
Content d
Value 444
Name: 3, dtype: object
——————————————————————————————————————————
No. Content Value
1 2 b 222
3 4 d 444
——————————————————————————————————————————
No. Content Value
1 2 b 222
2 3 c 333
3 4 d 444

拿到某一列

df = pandas.read_csv("test.txt")

# A single column name yields a Series.
content_column = df["Content"]
print(content_column)
print("====================================================")
# A list of column names yields a DataFrame with only those columns.
wanted = ["Content", "Value"]
print(df[wanted])

0 a
1 b
2 c
3 d
Name: Content, dtype: object
——————————————————————————————————————————
Content Value
0 a 111
1 b 222
2 c 333
3 d 444

取最大(小)值

df = pandas.read_csv("test.txt")
value_column = df["Value"]
# Largest entry of the column, then the smallest.
print(value_column.max())
print(value_column.min())

444
111

sort_values—按列排序

df = pandas.read_csv("test.txt")

# Ascending order is the default; inplace=True reorders df itself.
df.sort_values(by="Value", inplace=True)
print(df)
print("====================================================")
# Same column again, this time from largest to smallest.
df.sort_values(by="Value", ascending=False, inplace=True)
print(df)

No. Content Value
0 1 a 111
1 2 b 222
2 3 c 333
3 4 d 444
——————————————————————————————————————————
No. Content Value
3 4 d 444
2 3 c 333
1 2 b 222
0 1 a 111

isnull—判断是否有缺失值

df = pandas.read_csv("test1.csv")
print(df)
print("====================================================")
# NOTE(review): the other test1.csv snippets read a column named "Values";
# confirm which column name the file actually has for this example.
value_column = df["Value"]
# Boolean mask: True wherever the column entry is missing.
missing_mask = pandas.isnull(value_column)
print(missing_mask)
# Keep only the entries flagged as missing.
missing_entries = value_column[missing_mask]
print("====================================================")
print(missing_entries)
print("====================================================")
# Number of missing entries.
print(len(missing_entries))

No.\t Content Value
0 1 a 111.0
1 2 b NaN
2 3 c 333.0
3 4 d 444.0
——————————————————————————————————————————
0 False
1 True
2 False
3 False
Name: Value, dtype: bool
——————————————————————————————————————————
1 NaN
Name: Value, dtype: float64
——————————————————————————————————————————
1

mean—求列的均值

df = pandas.read_csv("test1.csv")
# mean() skips missing values by default.
values_column = df["Values"]
print(values_column.mean())

277.25

pivot_table—数据透视表

import numpy
df = pandas.read_csv("test1.csv")
print(df)
print("====================================================")
# Group rows by "No" and average "Values" within each group.
# The aggregation is named by string: recent pandas releases emit a
# FutureWarning when a NumPy callable such as numpy.mean is passed as
# aggfunc, and recommend the string name to keep the current behaviour.
statistics = df.pivot_table(index="No", values="Values", aggfunc="mean")
print(statistics)
print("====================================================")
# Same grouping, summing two value columns at once.
statistics = df.pivot_table(index="No", values=["Values", "Values1"], aggfunc="sum")
print(statistics)

No Content Values Values1
0 1 a 111.0 32.0
1 2 b NaN 45.0
2 3 c 333.0 25.0
3 4 d 444.0 76.0
4 1 e 221.0 NaN
——————————————————————————————————————————
Values
No
1 166.0
2 NaN
3 333.0
4 444.0
——————————————————————————————————————————
Values Values1
No
1 332.0 32.0
2 NaN 45.0
3 333.0 25.0
4 444.0 76.0

dropna—去掉含缺失值的行(列)

import numpy
df = pandas.read_csv("test1.csv")

# axis=1: drop every column that contains at least one missing value.
without_na_columns = df.dropna(axis=1)
print(without_na_columns)
print("====================================================")
# axis=0 with subset: drop rows missing a value in either listed column.
required = ["Values", "Values1"]
without_na_rows = df.dropna(axis=0, subset=required)
print(without_na_rows)

No Content
0 1 a
1 2 b
2 3 c
3 4 d
4 1 e
——————————————————————————————————————————
No Content Values Values1
0 1 a 111.0 32.0
2 3 c 333.0 25.0
3 4 d 444.0 76.0

loc—定位

df = pandas.read_csv("test1.csv")
# loc[row label, column label] picks out a single cell.
cell = df.loc[2, "Values"]
print(cell)

333.0

reset_index—重建索引

df = pandas.read_csv("test1.csv")

# After sorting, the rows keep their original index labels.
df.sort_values(by="Values", ascending=False, inplace=True)
print(df)
print("====================================================")
# drop=True discards the old labels and renumbers the rows from 0.
df = df.reset_index(drop=True)
print(df)

No Content Values Values1
3 4 d 444.0 76.0
2 3 c 333.0 25.0
4 1 e 221.0 NaN
0 1 a 111.0 32.0
1 2 b NaN 45.0
——————————————————————————————————————————
No Content Values Values1
0 4 d 444.0 76.0
1 3 c 333.0 25.0
2 1 e 221.0 NaN
3 1 a 111.0 32.0
4 2 b NaN 45.0

apply—自定义函数

A = pandas.read_csv("test1.csv")
def third(matrix):
    """Return the entry of *matrix* stored under index label 2.

    The lookup is label-based (``.loc``), not positional. When passed to
    ``DataFrame.apply`` the function is called once per column, so
    *matrix* is then a single column rather than the whole table.
    """
    row_label = 2
    return matrix.loc[row_label]
print(A.apply(third))

No 3
Content c
Values 333
Values1 25
dtype: object

Series

Series 构造

构造单列

df = pandas.read_csv("test1.csv")
print(type(df))
# Selecting a single column yields a Series.
values_series = df["Values"]
print(type(values_series))
# First three entries of that Series.
first_three = values_series[0:3]
print(first_three)

<class 'pandas.core.frame.DataFrame'>
<class 'pandas.core.series.Series'>
0 111.0
1 NaN
2 333.0
Name: Values, dtype: float64

构造多列并指定索引

from pandas import Series

df = pandas.read_csv("test1.csv")
# Use the "Content" strings as index labels for the "Values" numbers.
content = df["Content"].values
values = df["Values"].values
series = Series(values, index=content)
# reindex() looks labels up and yields NaN for absent ones (333 is not a
# label here). Plain series[["c", 333]] raises KeyError on pandas >= 1.0
# because list indexing with a missing label is no longer allowed.
result = series.reindex(["c", 333])
print(result)
print("====================================================")
# The index holds strings, so use iloc to make the positional slice explicit.
result = series.iloc[0:4]
print(result)

c 333.0
333 NaN
dtype: float64
——————————————————————————————————————————
a 111.0
b NaN
c 333.0
d 444.0
dtype: float64

Series 应用

按索引和值排序

df = pandas.read_csv("test1.csv")
content = df["Content"].values
values = df["Values"].values

# Manual route: sort the labels, then reindex the Series in that order.
series = Series(values, index=content)
sorted_index = sorted(series.index.tolist())
series = series.reindex(sorted_index)
print(series)
print("====================================================")
# Built-in route: sort_index() orders the entries by label ...
series = Series(values, index=content)
sorted_by_index = series.sort_index()
print(sorted_by_index)
print("====================================================")
# ... and sort_values() orders them by value, with NaN placed last.
sorted_by_values = series.sort_values()
print(sorted_by_values)

a 111.0
b NaN
c 333.0
d 444.0
e 221.0
dtype: float64
——————————————————————————————————————————
a 111.0
b NaN
c 333.0
d 444.0
e 221.0
dtype: float64
——————————————————————————————————————————
a 111.0
e 221.0
c 333.0
d 444.0
b NaN
dtype: float64

series 与 numpy 混用

df = pandas.read_csv("test1.csv")
content = df["Content"].values
# Both Series share the "Content" labels as their index.
result1 = Series(df["Values"].values, index=content)
result2 = Series(df["Values1"].values, index=content)
# NumPy functions accept Series directly; a NaN on either side gives NaN.
print(numpy.add(result2, result1))

a 143.0
b NaN
c 358.0
d 520.0
e NaN
dtype: float64

求两列均值

df = pandas.read_csv("test1.csv")
labels = df["Content"].values
result1 = Series(df["Values"].values, index=labels)
result2 = Series(df["Values1"].values, index=labels)
# Element-wise average of the two columns; NaN in either column gives NaN.
total = result1 + result2
mean = total / 2
print(mean)

a 71.5
b NaN
c 179.0
d 260.0
e NaN
dtype: float64