代码:
import xlrd
import time
def readxls(path, colnum):
    """Read one column from the first worksheet of an Excel workbook.

    Args:
        path: Location of the workbook, passed to ``xlrd.open_workbook``.
        colnum: Zero-based index of the column to read.

    Returns:
        A list with the values of column ``colnum``, top to bottom.
    """
    # NOTE(review): main() passes an .xlsx file; xlrd 2.x only reads .xls,
    # so confirm the installed xlrd version supports the input format.
    workbook = xlrd.open_workbook(path)
    first_sheet = workbook.sheets()[0]
    return list(first_sheet.col_values(colnum))
def datagjc(data):
    """Return the set of distinct keywords found in ``data``.

    Args:
        data: Iterable of strings, each holding keywords separated by '/'.

    Returns:
        A set with every keyword that occurs at least once.
    """
    return {keyword for entry in data for keyword in entry.split('/')}
def gxjz(data_set):
    """Build the empty co-occurrence matrix with keyword headers.

    The matrix is a square list of lists whose side is one larger than the
    number of keywords. Row 0 and column 0 hold the keywords as headers;
    every other cell, and the [0][0] corner, starts at 0.

    Args:
        data_set: Sized collection of keywords.

    Returns:
        The header-labelled matrix, all counts initialised to 0.
    """
    size = len(data_set) + 1
    matrix = [[0] * size for _ in range(size)]
    # Headers start at index 1 because index 0 is the corner cell.
    for position, keyword in enumerate(data_set, 1):
        matrix[0][position] = keyword  # header row (first line of output)
        matrix[position][0] = keyword  # header column (first field per line)
    return matrix
def tcsj(data, list1):
    """Fill the co-occurrence counts into a header-labelled matrix.

    For every off-diagonal cell the stored value is the number of entries
    in ``data`` that contain both the row keyword and the column keyword.
    Keywords are taken from column 0 of ``list1``. Diagonal cells are left
    untouched.

    Args:
        data: Iterable of strings, each holding keywords separated by '/'.
        list1: Square matrix with keyword headers; it is modified in place.

    Returns:
        The same ``list1`` object, with counts filled in.
    """
    records = [entry.split('/') for entry in data]
    size = len(list1)
    # Index 0 is the header row/column, so counting starts at 1.
    for row in range(1, size):
        for col in range(1, size):
            if row != col:
                list1[row][col] = sum(
                    1
                    for keywords in records
                    if list1[col][0] in keywords and list1[row][0] in keywords
                )
    return list1
def txt(list2, path):
    """Write a matrix to a text file, one row per line.

    Each cell is converted with ``str`` and followed by two tab characters
    to widen the gap between values; each row ends with a newline.

    Args:
        list2: Iterable of rows, each an iterable of cell values.
        path: Destination file; it is opened in 'w' mode and overwritten.
    """
    with open(path, 'w') as out:
        for row in list2:
            # Cells are written individually so the output is not a list repr.
            out.write(''.join(str(cell) + '\t\t' for cell in row) + '\n')
def main():
    """Build a keyword co-occurrence matrix from a spreadsheet and save it.

    Reads column index 2 of the input workbook, collects the distinct
    keywords, builds and fills the co-occurrence matrix, writes it to a
    text file, and prints the elapsed wall-clock time.
    """
    start = time.time()
    path1 = r'C:\Users\wy\Desktop\test.xlsx'
    # The drive prefix was previously doubled ('C:C:\...'), which made
    # open() fail with "IOError: [Errno 22] invalid mode ('w') or filename".
    path2 = r'C:\Users\wy\Desktop\test1.txt'
    data = readxls(path1, 2)
    data_set = datagjc(data)
    list1 = gxjz(data_set)
    list2 = tcsj(data, list1)
    txt(list2, path2)
    end = time.time()
    print('代码运行时间: %s s' % (end - start))


if __name__ == "__main__":
    main()
运行结果:
IOError Traceback (most recent call last)
<ipython-input-10-98ae6b197541> in <module>()
70
71 if __name__ == "__main__":
---> 72 main()
73
74
<ipython-input-10-98ae6b197541> in main()
65 list1 = gxjz(data_set)
66 list2 = tcsj(data,list1)
---> 67 txt(list2,path2)
68 end = time.time()
69 print('代码运行时间: %s s'%(end-start))
<ipython-input-10-98ae6b197541> in txt(list2, path)
50
51 def txt(list2,path):
---> 52 with open(path,'w') as f:
53 for a in list2:
54 for b in a:#直接录入会保持列表格式,需要将其拆开
IOError: [Errno 22] invalid mode ('w') or filename: 'C:C:\\Users\\wy\\Desktop\\test1.txt'