代码
import pandas as pd
import openpyxl as op
data_a = pd.read_excel('a1.xlsx')
data_b = pd.read_excel('B1.xlsx')
#count_if函数
def countif(line, base1, base2, base3, base4, count_column):
line['cnt_if'] = sum(line[count_column] == base1 )+sum( line[count_column] == base2)+sum( line[count_column] == base3)+sum( line[count_column] == base4)
return line
#将count_if应用到每一行
wb = op.load_workbook( "B1.xlsx" )
sh=wb[ "Sheet1" ]
for item in data_b.index: #获取行坐标
count = 0 # 统计置零
for i in range( 1,5 ): #获取列坐标
col_name ="Y"+str(i) #生成列名
data_loc = data_b.loc[item, col_name] #定位数据坐标
# 给对比参数赋值
if col_name == 'Y1':
Y1=data_loc
elif col_name == 'Y2':
Y2=data_loc
elif col_name == 'Y3':
Y3=data_loc
else:
Y4=data_loc
# 数据对比
new_df = data_a.apply(countif, axis=1, args=( Y1, Y2, Y3, Y4, ['X1', 'X2', 'X3', 'X4','X5'] ))
count_tmp = sum(new_df['cnt_if'] == 4) # 统计每行相同数出现 4次的数量
if count_tmp > 0 :
count = count+count_tmp
print(count) #----
# 结果写入excel
sh.cell( item+2,7,count ) #写入表指定单元格
wb.save("B1.xlsx") #写入excel文件
这个代码效率太低,统计20万条数据要几个小时,有什么方法让效率高点吗?