只针对这份数据，还是要做成通用的方案？如果要通用，出现 20220325、20220301 这种日期时，该如何处理？
为了测试效果，我额外加了部分数据。
import pandas as pd
# Sample input: one "<project> <YYYYMMDD>" record per line. Project A
# deliberately contains out-of-order dates to exercise the general case.
data_str = '''A 20220318
A 20220325
A 20220301
A 20220319
A 20220320
B 20220318
B 20220319
C 20220317
C 20220318
C 20220320
D 20220320'''
data_lst = [n.split(" ") for n in data_str.split("\n")]
# Build the test frame; both columns are kept as strings.
df1 = pd.DataFrame(data_lst, columns=['项目', '日期'], dtype=str)
# Number of records per project.
df2 = df1.groupby('项目', as_index=False).count()
df2.columns = ['项目', '重复次数']
# Distinct dates over all projects, in ascending order; their count is the
# number of date columns produced by the pivot below.
df_rq = df1.groupby('日期', as_index=False).count()
rq_lst = sorted(df_rq['日期'])
# Pivot dates from rows to columns (one row per project).
# Unstacking the single 'data' Series keeps the columns single-level;
# unstacking the whole frame gives two-level columns, which pandas >= 2.0
# refuses to merge with the single-level df2.
# Duplicate (project, date) pairs are dropped first because unstack cannot
# reshape an index that contains duplicates.
df1['data'] = df1['日期']
df4 = (
    df1.drop_duplicates(['项目', '日期'])
    .set_index(['项目', '日期'])['data']
    .unstack()
)
# Result 1: record count plus one column per distinct date
# (NaN where the project has no record on that date).
df5 = pd.merge(df2, df4.reset_index(), how='inner', on='项目')
col_name = ['项目', '重复次数']
col_name.extend("日期{}".format(n) for n in range(1, len(rq_lst) + 1))
df5.columns = col_name
import datetime
def T(td):
    """Return the calendar day after *td*.

    Args:
        td: A date string in ``%Y%m%d`` form, e.g. ``"20220318"``.

    Returns:
        The following day, formatted the same way (``"20220319"``).

    Raises:
        ValueError: If *td* does not match ``%Y%m%d``.
    """
    day = datetime.datetime.strptime(td, '%Y%m%d').date()
    return (day + datetime.timedelta(days=1)).strftime('%Y%m%d')
def _is_yyyymmdd(value):
    """Return True if *value* is an 8-digit string naming a real date."""
    if not isinstance(value, str) or len(value) != 8 or not value.isdigit():
        return False
    try:
        datetime.datetime.strptime(value, '%Y%m%d')
    except ValueError:
        return False
    return True


def get_date_lst(data):
    """Join the date values found in *data* into one comma-separated string.

    Only strings that are valid ``YYYYMMDD`` dates are kept, in their
    original order. Anything else (NaN placeholders, counts, names that
    merely happen to be 8 characters long) is skipped, so non-string values
    can never reach ``str.join``.

    Args:
        data: An iterable of cell values, e.g. one DataFrame row.

    Returns:
        The kept dates joined with ``","``; ``""`` when there are none.
    """
    return ",".join(d for d in data if _is_yyyymmdd(d))
def get_date_status(data):
    """Classify whether the dates include three consecutive calendar days.

    Args:
        data: A one-element sequence or pandas Series whose first element
            is a comma-separated string of ``YYYYMMDD`` dates.

    Returns:
        ``"连续三日"`` if any three of the dates fall on consecutive days,
        otherwise ``"非连续三日"``. The order of the dates and any
        duplicates among them do not affect the result.

    Raises:
        ValueError: If three or more distinct tokens are present and one of
            them is not a valid ``%Y%m%d`` date.
    """
    # Positional ``data[0]`` on a label-indexed Series is deprecated in
    # pandas 2.x, so go through ``.iloc`` when the argument offers it.
    joined = data.iloc[0] if hasattr(data, 'iloc') else data[0]

    tokens = {tok.strip() for tok in joined.split(',')}
    tokens.discard('')
    if len(tokens) < 3:
        return "非连续三日"

    # Parse, de-duplicate and sort so the scan below sees each day once,
    # in chronological order.
    days = sorted(
        {datetime.datetime.strptime(tok, '%Y%m%d').date() for tok in tokens}
    )
    one_day = datetime.timedelta(days=1)
    run = 1
    for prev, cur in zip(days, days[1:]):
        run = run + 1 if cur - prev == one_day else 1
        if run >= 3:
            return "连续三日"
    return "非连续三日"
# Result 2: per project, the joined date list and the consecutive-day status.
# Build df6 from an explicit copy so the column assignments below write to
# an independent frame rather than to a slice of df5.
df6 = df5[['项目', '重复次数']].copy()
# Feed only the date columns to get_date_lst so the project name and the
# record count can never be mistaken for dates.
date_cols = [c for c in df5.columns if c not in ('项目', '重复次数')]
df6['日期'] = df5[date_cols].apply(get_date_lst, axis=1)
# get_date_status reads element 0 of its argument; wrapping the string in a
# list avoids positional indexing on a label-indexed row Series, which is
# deprecated in pandas 2.x.
df6['状态'] = df6['日期'].map(lambda joined: get_date_status([joined]))
print("数据")
print(df1)
print("结果1")
print(df5)
print("结果2")
print(df6)