最佳迭代方法:

import pandas as pd
from tqdm import tqdm

# Convert the frame to a list of per-row dicts once, then loop over it with a
# tqdm progress bar.
for row in tqdm(df.to_dict(orient="records")):
    # do something with `row`
    pass  # placeholder: a comment alone is not a valid loop body

获取行数和列数

import pandas as pd

# Option 1: take the length of each axis (axes[0] -> rows, axes[1] -> columns).
rows = len(df.axes[0])
cols = len(df.axes[1])

# Option 2: read the same two counts from the shape tuple.
rows = df.shape[0]
cols = df.shape[1]

分块读取超大文件

import pandas as pd
from tqdm import tqdm

# Passing chunksize makes read_csv yield the file piece by piece
# (1000 rows per chunk) instead of loading it all at once.
data = pd.read_csv('dataset.csv', chunksize=1000)

for chunk in data:
    # Each chunk is processed row by row, with a progress bar per chunk.
    for row in tqdm(chunk.to_dict(orient="records")):
        # do something with `row`
        pass  # placeholder: a comment alone is not a valid loop body

根据已有单个列扩充新列

import pandas as pd

def valFunc(val):
    """Return ``val`` increased by one."""
    incremented = val + 1
    return incremented

# Build column 'D' by applying valFunc to each value of column 'C'.
df['D'] = df['C'].apply(valFunc)

自定义函数筛选

import pandas as pd

def filter_fn(row):
    """Decide whether ``row`` should be kept.

    Intended to be called once per row, e.g. via
    ``df.apply(filter_fn, axis=1)``.

    Returns:
        True to keep the row, False to delete it.
    """
    # Placeholder predicate: keep every row until a real condition is written.
    # (The original had no body at all, which is a syntax error.)
    return True

# Evaluate filter_fn on every row (axis=1) and store the results in 'new_col'.
# NOTE(review): this only records the keep/delete flag; no rows are removed
# here. To actually drop rows, index the frame with the flag, e.g.
# df[df['new_col']] -- confirm the intended usage.
df['new_col'] = df.apply(filter_fn, axis=1)

生成随机数据

import numpy as np
import pandas as pd

# 10 x 4 frame of random integers drawn from [0, 100), columns named A-D.
df = pd.DataFrame(
    np.random.randint(low=0, high=100, size=(10, 4)),
    columns=["A", "B", "C", "D"],
)

显示所有行和列

# None removes the display limit, so every column and every row is shown.
pd.options.display.max_columns = None
pd.options.display.max_rows = None

常用函数

import pandas as pd

# Skip malformed lines while reading
df = pd.read_csv('dataset.csv', index_col=False, encoding='GB18030', on_bad_lines = 'skip')

# Drop a column
df1 = df.drop(['A'], axis=1)

# Drop a row (by index label)
df1 = df.drop([1])

# Distinct values of a column
df['col'].unique()

# Drop duplicate rows, keeping the first occurrence
df.drop_duplicates(keep='first', inplace=True)

# Filter rows by a column value
df1 = df[df['col']=='val']

# Group by a column and count the rows in each group
df.groupby(['col1']).size().reset_index(name='counts')

# Shuffle the rows
df = df.sample(frac=1).reset_index(drop=True)

# Save to CSV without writing the index
df.to_csv("output.csv", index=False)

# Export to Excel
df.to_excel('output.xlsx')