# -*- encoding:utf-8 -*-
import sys
import urllib.request
import xlwt
import os
import xlrd
import HTMLParser
def read_excel(path):
    """Read every data row (skipping the header row 0) from the first
    sheet of the .xls workbook at *path*.

    Returns a list of rows, each row a list of cell values.
    """
    workbook = xlrd.open_workbook(path)
    sheet = workbook.sheets()[0]
    # Row 0 is assumed to be a header and is excluded.
    return [list(sheet.row_values(r)) for r in range(1, sheet.nrows)]
def save_data(data, path, charset='utf-8'):
    """Write *data* (a list of rows, each a list of cells) to xls files.

    xlwt sheets hold at most 65535 rows beyond row 0, so every 65535 rows
    the current workbook is saved as ``path + '-' + <n> + '.xls'`` and a
    fresh one is started; the final workbook is saved as ``path + '.xls'``.

    bytes cells are decoded with *charset* (undecodable bytes ignored);
    all other cell types are written as-is.
    """
    xlsx = xlwt.Workbook()
    table = xlsx.add_sheet('DATA')
    num = 0
    for row in range(len(data)):
        # Floor division: Python 3 '/' returns a float, which broke both
        # the per-sheet row offset and the spill-file name in the original.
        num = row // 65535
        if row % 65535 == 0 and num > 0:
            # Sheet is full: flush the current workbook and start a new one.
            xlsx.save(path + '-' + str(num) + '.xls')
            xlsx = xlwt.Workbook()
            table = xlsx.add_sheet('DATA')
        for col in range(len(data[row])):
            cell = data[row][col]
            try:
                # bytes cells have .decode; str/number cells raise
                # AttributeError and are written unchanged below.
                table.write(row - num * 65535, col, cell.decode(charset, 'ignore'))
            except AttributeError:
                table.write(row - num * 65535, col, cell)
    xlsx.save(path + '.xls')
def save_page(path, pmid, html):
    """Save *html* to ``path + pmid + '.html'``, creating *path* if needed.

    *path* is expected to end with a separator (callers pass e.g.
    '.\\Protein_Pages\\'). *html* may be str or bytes — urlopen().read()
    returns bytes in Python 3, which the original text-mode open could not
    write (TypeError), so fetched pages were never actually cached.
    """
    if not os.path.exists(path):
        os.makedirs(path)
    file_path = path + pmid + '.html'
    # Pick the file mode that matches the payload type.
    mode = 'wb' if isinstance(html, bytes) else 'w'
    with open(file_path, mode) as html_file:
        html_file.write(html)
    print(pmid, ' save success!')
def get_dir(path):
    """Return the entry names inside *path*, or [] when it does not exist."""
    if not os.path.exists(path):
        return []
    return [entry for entry in os.listdir(path)]
def drop_label(string):
    """Strip every complete ``<...>`` tag from *string*.

    A '<' with no matching '>' after it (and anything following it) is
    left untouched.
    """
    result = string
    while True:
        open_pos = result.find('<')
        if open_pos < 0:
            return result
        close_pos = result.find('>', open_pos)
        if close_pos < 0:
            return result
        # Splice out the tag, keeping the text on either side.
        result = result[:open_pos] + result[close_pos + 1:]
def drop_space(string):
    """Remove all spaces and tabs from *string* and collapse runs of
    blank lines ('\\r\\n\\r\\n') down to single '\\r\\n' line breaks."""
    # str.replace removes every occurrence in one pass.
    string = string.replace(' ', '').replace('\t', '')
    # Repeated collapse: three consecutive CRLFs need two passes.
    while '\r\n\r\n' in string:
        string = string.replace('\r\n\r\n', '\r\n')
    # Final space pass mirrors the original (a no-op after the first pass).
    return string.replace(' ', '')
def HtmlParsing(string):
    """Placeholder: returns *string* unchanged.

    NOTE(review): presumably intended to parse HTML (the file imports
    HTMLParser, which is unused) — not implemented yet.
    """
    return string
def begin_search(path):
    """For each UniProt accession listed in the Excel file at *path*, fetch
    (or load from the local cache) its uniprot.org entry page and scrape:
    GO molecular function, GO biological process, interactions, and
    subcellular location.

    Returns a list of rows:
        [accession, url, GMF, KBP, Interaction, location]

    Pages are cached under .\\Protein_Pages\\ so re-runs skip the network.
    """
    nums = read_excel(path)
    print('Search Number : ', len(nums))
    # Names of already-downloaded pages; used as the cache index below.
    files = get_dir('.\\Protein_Pages\\')
    url_head = r'http://www.uniprot.org/uniprot/'
    data = []
    for num in nums:
        # Accession number is taken from the first column of the row.
        n = num[0]
        if n == '':
            continue
        # NOTE(review): nums.index(num) is O(n) each iteration and reports
        # the first matching row for duplicates; enumerate would be safer.
        print ('Protein',nums.index(num),':',n)
        url = url_head + n
        if n + '.html' in files:
            # Cache hit: read the saved page (text mode -> str).
            html = open('.\\Protein_Pages\\' + n + '.html','r').read()
        else:
            try:
                # NOTE(review): urlopen().read() returns bytes on Python 3;
                # the str-based .find()/.replace() calls below would raise
                # TypeError on a freshly fetched page — confirm the intended
                # runtime (this looks ported from Python 2).
                html = urllib.request.urlopen(url).read()
                save_page('.\\Protein_Pages\\',n,html)
            except:
                print (n + ' : Connection Failed!')
                continue
        GMF = ''           # GO - Molecular function text
        KBP = ''           # GO - Biological process text
        Interaction = ''   # interaction partners, comma-separated
        location = ''      # subcellular location, comma-separated
        # NOTE(review): this replace is a no-op ('<' -> '<'); it was
        # presumably meant to decode an entity such as '&lt;' back to '<'
        # — confirm against the original source.
        html = html.replace('<','<')
        # --- GO - Molecular function section ---
        s = html.find('GO - Molecular function')
        if s != -1:
            s = html .find('</h4>',s)
            e = html .find('</span>',s)
            GMF = GMF + drop_label(html[s:e])
            # Trim the trailing 'Source:' attribution.
            # NOTE(review): if 'Source:' is absent, find() yields -1 and
            # GMF[:-1] silently drops the last character.
            s = GMF.find('Source:')
            GMF = GMF[:s]
        # --- GO - Biological process section ---
        # NOTE(review): lowercase 'Go' likely never matches the page's
        # 'GO - Biological process' heading, leaving KBP empty.
        s = html.find('Go - Biological process')
        if s != -1:
            s = html .find('</h4>',s)
            e = html .find('</span>',s)
            KBP = KBP + drop_label(html[s:e])
        # --- Interaction table ---
        s = html.find('<div class="section " id="interaction">')
        if s != -1:
            e = html.find('</div>',s)
            content = html[s:e]
            s = content.find('<table class="databaseTable INTERACTION">')
            Interaction = ''
            st = ''
            # Walk successive '</span>' markers, collecting
            # '<partner>:<link text>' pairs separated by commas.
            while True:
                s = content.find('</span>',s)
                if s == -1:
                    break
                if st != '':
                    st = st + ','
                e = content.find('<sup>',s)
                # s+7 skips past the '</span>' marker itself.
                st = st + content[s+7:e] + ':'
                s = content.find('<a ',e)
                e = content.find('<br/>',s)
                st = st + drop_label(content[s:e])
            Interaction = st
        # --- Subcellular location section ---
        s = html.find('<div class="section " id="subcellular_location">')
        if s != -1:
            s = html.find('<div class="annotation">',s)
            if s != -1:
                e = html.find('</div>',s)
                content = html[s:e]
                e = 0
                # Collect the anchor text of each '<li>' entry.
                while True:
                    s = content.find('<li>',e)
                    if s == -1:
                        break
                    if location != '':
                        location = location + ','
                    s = content.find('<a ',s)
                    e = content.find('</a>',s)
                    location = location + drop_label(content[s:e])
        # Fallback: derive the location from the keyword list instead.
        if location == '':
            s = html.find('Keywords - Cellular component')
            if s != -1:
                s = html.find('<span>',s)
                e = html.find('</span>',s)
                location = drop_label(html[s:e])
        # NOTE(review): dead branch — identical condition to the 'if'
        # above, so this 'elif' can never be taken; probably meant to be a
        # second-level fallback chained differently.
        elif location == '':
            s = html.find('<ul class="noNumbering cellular_component">')
            if s != -1:
                e = html.find('</ul>',s)
                content = html[s:e]
                e = 0
                while True:
                    s = content.find('<a ',e)
                    if s == -1:
                        break;
                    if location != '':
                        location = location + ','
                    # NOTE(review): find('</a>') lacks a start offset, so
                    # 'e' never advances past the first anchor — this would
                    # loop forever if the branch were reachable.
                    e = content.find('</a>')
                    location = location + drop_label(content[s:e])
        data.append([n,url,GMF,KBP,Interaction,location])
    return data
if __name__ == "__main__":
    # Input workbook: first column of each data row holds a UniProt accession.
    path = 'D:\\Protein_Search\\test.xls'
    data = begin_search(path)
    # Strip the '.xls' extension before save_data re-appends it. The original
    # replaced '.xlsx', which never matched this '.xls' path, producing
    # 'test.xls.xls' instead of 'test_Result.xls'.
    save_data(data, path.replace('.xls', '_Result'))
# Just a simple tool for retrieving literature (只是一个简单检索文献的工具).