Basic

import pandas as pd
# Report which pandas build produced the outputs shown in this notebook.
print(
    f" Using {pd.__name__},Version {pd.__version__}"
)
 Using pandas,Version 0.23.0

创建空 DataFrame

# Calling the constructor without data yields a frame with no rows and no columns.
df = pd.DataFrame(data=None)
print(df)
Empty DataFrame
Columns: []
Index: []

从 dict 创建 DataFrame

# Build a DataFrame from a mapping of column label -> list of column values.
# The mapping is named `people` rather than `dict` so that the built-in
# `dict` type is not shadowed for the rest of the session.
people = {
    'name': ["Tom", "Bob", "Mary", "James"],
    'age': [18, 30, 25, 40],
    'city': ["Beijing", "ShangHai", "GuangZhou", "ShenZhen"],
}

df = pd.DataFrame(people)
df

name

age

city

0

Tom

18

Beijing

1

Bob

30

ShangHai

2

Mary

25

GuangZhou

3

James

40

ShenZhen

# Build the table from row-wise records, supplying the row labels (a named
# Index) and the column labels explicitly.
index = pd.Index(["Tom", "Bob", "Mary", "James"], name='person')
cols = ['age', 'city']
data = [
    [18, 'Beijing'],
    [30, 'ShangHai'],
    [25, 'GuangZhou'],
    [40, 'ShenZhen'],
]

df = pd.DataFrame(data, index=index, columns=cols)
df

age

city

person

Tom

18

Beijing

Bob

30

ShangHai

Mary

25

GuangZhou

James

40

ShenZhen

对columns的基础操作

Add column

# Recreate the sample frame used by the column operations that follow.
# The mapping is named `people` rather than `dict` so that the built-in
# `dict` type is not shadowed for the rest of the session.
people = {
    'name': ["Tom", "Bob", "Mary", "James"],
    'age': [18, 30, 25, 40],
    'city': ["Beijing", "ShangHai", "GuangZhou", "ShenZhen"],
}

df = pd.DataFrame(people)
df

name

age

city

0

Tom

18

Beijing

1

Bob

30

ShangHai

2

Mary

25

GuangZhou

3

James

40

ShenZhen

# Assigning a scalar to a new column label adds the column and fills every
# row with that value.
df['country'] = 'USA'
df

name

age

city

country

0

Tom

18

Beijing

USA

1

Bob

30

ShangHai

USA

2

Mary

25

GuangZhou

USA

3

James

40

ShenZhen

USA

# Add a new column whose values are taken from an existing column.
# NOTE(review): 'adress' is presumably a misspelling of 'address'; the label
# is used consistently in the cells below, so it must be renamed everywhere
# at once if it is corrected.
df['adress'] = df['country']
df

name

age

city

country

adress

0

Tom

18

Beijing

USA

USA

1

Bob

30

ShangHai

USA

USA

2

Mary

25

GuangZhou

USA

USA

3

James

40

ShenZhen

USA

USA

Change column values

# Assigning a scalar to an existing column overwrites every row of it.
# The 'adress' column, filled from 'country' earlier, keeps its old values.
df['country'] = 'China'
df

name

age

city

country

adress

0

Tom

18

Beijing

China

USA

1

Bob

30

ShangHai

China

USA

2

Mary

25

GuangZhou

China

USA

3

James

40

ShenZhen

China

USA

# Element-wise string concatenation: each row becomes "<city>,<country>".
df['adress'] = df['city']+','+ df['country']
df

name

age

city

country

adress

0

Tom

18

Beijing

China

Beijing,China

1

Bob

30

ShangHai

China

ShangHai,China

2

Mary

25

GuangZhou

China

GuangZhou,China

3

James

40

ShenZhen

China

ShenZhen,China

Delete columns

# Two ways to remove a column in place:
# 1) DataFrame.drop, naming the column through the `columns` keyword;
df.drop(columns='country', inplace=True)
# 2) the `del` statement on the column label.
del df['city']
df

name

age

adress

0

Tom

18

Beijing,China

1

Bob

30

ShangHai,China

2

Mary

25

GuangZhou,China

3

James

40

ShenZhen,China

Select columns

# Bracket access with a single label returns that column as a Series.
df['age']
0    18
1    30
2    25
3    40
Name: age, dtype: int64
# Attribute-style access to the 'name' column; it yields a Series, like
# bracket access does.
# NOTE(review): presumably this only works for labels that are valid Python
# identifiers and do not collide with DataFrame attributes — confirm before
# relying on it for other columns.
df.name
0      Tom
1      Bob
2     Mary
3    James
Name: name, dtype: object
# A list of labels selects several columns, in the order listed, as a DataFrame.
df[['age','name']]  

age

name

0

18

Tom

1

30

Bob

2

25

Mary

3

40

James

# The column labels of the frame, as an Index.
df.columns
Index(['name', 'age', 'adress'], dtype='object')

Rename columns

# Rename columns through an explicit old-label -> new-label mapping.
# Only `columns` is passed: an `index=str` argument would additionally
# convert every row label to a string, which is unrelated to renaming
# columns and would silently change the index dtype.
df.rename(columns={'age': 'Age', 'name': 'Name', 'adress': 'Adress'}, inplace=True)
df

Name

Age

Adress

0

Tom

18

Beijing,China

1

Bob

30

ShangHai,China

2

Mary

25

GuangZhou,China

3

James

40

ShenZhen,China

# Apply a function to every column label: here, lower-case them all in place.
df.rename(columns=str.lower, inplace=True)
df

name

age

adress

0

Tom

18

Beijing,China

1

Bob

30

ShangHai,China

2

Mary

25

GuangZhou,China

3

James

40

ShenZhen,China

# Apply a function to every column label: here, capitalize each one in place.
df.rename(columns=str.capitalize, inplace=True)
df

Name

Age

Adress

0

Tom

18

Beijing,China

1

Bob

30

ShangHai,China

2

Mary

25

GuangZhou,China

3

James

40

ShenZhen,China

Set column value with conditions

# Label each row with an age group. Start from the fallback label and then
# overwrite the rows matched by each boolean mask.
ages = df['Age']
is_young = ages <= 18
is_middle_aged = (ages > 18) & (ages <= 30)

df['Group'] = 'elderly'
df.loc[is_young, 'Group'] = 'young'
df.loc[is_middle_aged, 'Group'] = 'middle_aged'
df

Name

Age

Adress

Group

0

Tom

18

Beijing,China

young

1

Bob

30

ShangHai,China

middle_aged

2

Mary

25

GuangZhou,China

middle_aged

3

James

40

ShenZhen,China

elderly

对rows的基础操作

loc函数查询

# Display the current frame for reference.
df

Name

Age

Adress

Group

0

Tom

18

Beijing,China

young

1

Bob

30

ShangHai,China

middle_aged

2

Mary

25

GuangZhou,China

middle_aged

3

James

40

ShenZhen,China

elderly

# A bare slice inside loc selects every row.
df.loc[:]

Name

Age

Adress

Group

0

Tom

18

Beijing,China

young

1

Bob

30

ShangHai,China

middle_aged

2

Mary

25

GuangZhou,China

middle_aged

3

James

40

ShenZhen,China

elderly

loc函数条件查询

# A boolean mask inside loc keeps only the rows where the condition is True.
df.loc[df['Age']>20]

Name

Age

Adress

Group

1

Bob

30

ShangHai,China

middle_aged

2

Mary

25

GuangZhou,China

middle_aged

3

James

40

ShenZhen,China

elderly

loc函数条件行列查询

# Row mask plus a column label: the 'Name' values of the rows whose Group is
# 'middle_aged', returned as a Series.
df.loc[df['Group']=='middle_aged','Name']
1     Bob
2    Mary
Name: Name, dtype: object

Where 查询

# where() keeps the frame's shape: rows that fail the condition are not
# dropped but have every cell replaced by NaN. That is also why Age is shown
# as a float (30.0, 40.0) in the output below.
filter_adult = df['Age']>25
result = df.where(filter_adult)
result

Name

Age

Adress

Group

0

NaN

NaN

NaN

NaN

1

Bob

30.0

ShangHai,China

middle_aged

2

NaN

NaN

NaN

NaN

3

James

40.0

ShenZhen,China

elderly

Query 筛选

# Display the current frame for reference before querying it.
df

Name

Age

Adress

Group

0

Tom

18

Beijing,China

young

1

Bob

30

ShangHai,China

middle_aged

2

Mary

25

GuangZhou,China

middle_aged

3

James

40

ShenZhen,China

elderly

# Filter rows with a query expression string.
# NOTE: conditions must be combined *inside* the expression string, e.g.
#     df.query('Group == "middle_aged" and Age > 25')
# Writing `'cond_a' and 'cond_b'` in Python evaluates to the second string
# only, so query() would never see the first condition. The expression below
# is the filter that was effectively applied and matches the output shown.
df.query('Age > 30')

Name

Age

Adress

Group

3

James

40

ShenZhen,China

elderly

DataFrame 其他信息

# Dimensions of the frame as a (rows, columns) tuple.
df.shape
(4, 4)
# Summary statistics; in the output below only the numeric Age column is covered.
df.describe()

Age

count

4.000000

mean

28.250000

std

9.251126

min

18.000000

25%

23.250000

50%

27.500000

75%

32.500000

max

40.000000

# First three rows, then last three rows.
# Only the last expression of a cell is rendered, so the output below shows
# the tail(3) result only; the head(3) result is computed and discarded.
df.head(3)
df.tail(3)

Name

Age

Adress

Group

1

Bob

30

ShangHai,China

middle_aged

2

Mary

25

GuangZhou,China

middle_aged

3

James

40

ShenZhen,China

elderly

读写CSV

把df导出为CSV,不要index

# Write the frame to person.csv without the row-index column.
# `index` is documented as a bool, so False is passed explicitly instead of
# relying on None being falsy.
df.to_csv('person.csv', index=False, sep=',')

读取 CSV 为 DataFrame

# Load person.csv (written by the export step) back into a new DataFrame.
person = pd.read_csv('person.csv')
person

Name

Age

Adress

Group

0

Tom

18

Beijing,China

young

1

Bob

30

ShangHai,China

middle_aged

2

Mary

25

GuangZhou,China

middle_aged

3

James

40

ShenZhen,China

elderly

Last updated