weixin_44337725
weixin_44337725
采纳率0%
2021-02-26 11:52

运行成功但是不出结果,请问各位大佬这是怎么回事?(程序没有报错,但没有生成结果文件)

已结题
#  -*- encoding:utf-8 -*-
import sys
import urllib.request
import xlwt
import os
import xlrd
import HTMLParser

def read_excel(path):
    """Load the first worksheet of the Excel file at *path*.

    The first row is treated as a header and skipped; every remaining
    row is returned as a list of its cell values.
    """
    workbook = xlrd.open_workbook(path)
    sheet = workbook.sheets()[0]
    # Copy each row's cells into a fresh list, skipping header row 0.
    return [list(sheet.row_values(r)) for r in range(1, sheet.nrows)]

def save_data(data,path,charset = 'utf-8'):
    """Write *data* (a list of rows, each a list of cells) to .xls files.

    The legacy .xls format caps a sheet at 65535 rows, so after each full
    chunk the current workbook is flushed to ``<path>-<n>.xls`` and a new
    one is started. The final (possibly partial) workbook is saved as
    ``<path>.xls``.

    charset: encoding used to decode byte-string cells before writing.
    """
    xlsx = xlwt.Workbook()
    table = xlsx.add_sheet('DATA')
    num = 0
    for row in range(len(data)):
        # BUG FIX: must be floor division. Under Python 3, ``row / 65535``
        # yields a float, making ``row - num * 65535`` a float row index
        # and breaking table.write for every row.
        num = row // 65535
        if row % 65535 == 0 and num > 0:
            # Current sheet is full: flush it and start a fresh workbook.
            xlsx.save(path + '-' + str(num) + '.xls')
            xlsx = xlwt.Workbook()
            table = xlsx.add_sheet('DATA')
        for col in range(len(data[row])):
            cell = data[row][col]
            try:
                # Byte strings get decoded; anything without .decode (str,
                # numbers) raises AttributeError and is written as-is.
                table.write(row - num * 65535, col, cell.decode(charset, 'ignore'))
            except (AttributeError, UnicodeDecodeError):
                table.write(row - num * 65535, col, cell)
    xlsx.save(path + '.xls')
    
def save_page(path,pmid,html):
    """Persist *html* to ``<path>/<pmid>.html``, creating *path* if needed.

    BUG FIX: *html* is bytes when it comes from urlopen().read() under
    Python 3, and writing bytes to a text-mode file raises TypeError.
    Bytes are now written in binary mode; str is written in text mode.
    The file is also closed via a context manager instead of a manual
    try/finally.
    """
    if not os.path.exists(path):
        os.makedirs(path)
    file_path = os.path.join(path, pmid + '.html')
    mode = 'wb' if isinstance(html, bytes) else 'w'
    with open(file_path, mode) as html_file:
        html_file.write(html)
    print (pmid,' save success!')
        
def get_dir(path):
    """Return the entry names inside *path*, or [] when it doesn't exist."""
    if not os.path.exists(path):
        return []
    # os.listdir already yields the names; just materialize a fresh list.
    return list(os.listdir(path))

def drop_label(string):
    """Strip every complete ``<...>`` tag span out of *string*.

    A ``<`` with no following ``>`` is left untouched, as is everything
    after it.
    """
    while '<' in string:
        lo = string.index('<')
        hi = string.find('>', lo)
        if hi < 0:
            # Unterminated tag: keep the remainder verbatim.
            break
        # Splice out the first remaining <...> span and rescan.
        string = string[:lo] + string[hi + 1:]
    return string

def drop_space(string):
    """Remove whitespace noise from scraped HTML text.

    Deletes all spaces, tabs and ``&nbsp;`` entities, and collapses runs
    of blank lines (``\\r\\n\\r\\n``) down to single ``\\r\\n`` breaks.
    The original processing order (spaces, tabs, blank lines, then
    &nbsp;) is preserved.
    """
    # str.replace substitutes every occurrence in one pass, so the
    # original while-loops around these fixed removals were redundant.
    string = string.replace(' ', '').replace("\t", '')
    # Collapsing blank lines genuinely needs a loop: each pass can create
    # a new "\r\n\r\n" adjacency out of longer runs.
    while "\r\n\r\n" in string:
        string = string.replace("\r\n\r\n", "\r\n")
    return string.replace('&nbsp;', '')

def HtmlParsing(string):
    """No-op placeholder: returns *string* unchanged.

    Presumably intended to parse/clean HTML (note the HTMLParser import
    at the top of the file) but never implemented -- TODO confirm.
    """
    return string

def begin_search(path):
    """Look up each UniProt accession listed in the Excel file at *path*.

    For every accession (first column, header row skipped): fetch -- or
    reuse a cached on-disk copy of -- its UniProt entry page, then scrape
    GO molecular function, GO biological process, interaction partners
    and subcellular location via raw string searching.

    Returns a list of rows:
    ``[accession, url, GMF, KBP, Interaction, location]``.

    NOTE(review): under Python 3, urllib.request.urlopen(...).read()
    returns *bytes*, so ``html.replace('&lt;','<')`` below raises
    TypeError for freshly downloaded pages -- a likely cause of the
    script appearing to run but producing no results. The response
    needs to be decoded to str first.
    """
    nums = read_excel(path)
    print('Search Number : ', len(nums))
    # Names of pages downloaded on a previous run; reused as a cache.
    files = get_dir('.\\Protein_Pages\\')
    url_head = r'http://www.uniprot.org/uniprot/'
    data = []
    for num in nums:
        n = num[0]  # accession expected in the first column of the row
        if n == '':
            continue
        print ('Protein',nums.index(num),':',n)
        url = url_head + n
        if n + '.html' in files:
            # Cached copy exists -- read it instead of hitting the network.
            html = open('.\\Protein_Pages\\' + n + '.html','r').read()
        else:
            try:
                html = urllib.request.urlopen(url).read()
                save_page('.\\Protein_Pages\\',n,html)
            except:
                # Best-effort: skip accessions whose page can't be fetched.
                print (n + ' : Connection Failed!')
                continue
        GMF = ''          # GO - Molecular function text
        KBP = ''          # GO - Biological process text
        Interaction = ''  # interaction partners
        location = ''     # subcellular location
        # Un-escape '<' so tag markers can be matched literally below.
        html = html.replace('&lt;','<')
        s = html.find('GO - Molecular function')
        if s != -1:
            s = html .find('</h4>',s)
            e = html .find('</span>',s)
            GMF = GMF + drop_label(html[s:e])
            # Cut off the trailing "Source:..." attribution.
            # NOTE(review): if 'Source:' is absent, find returns -1 and
            # GMF[:-1] silently drops the last character -- confirm intended.
            s = GMF.find('Source:')
            GMF = GMF[:s]

        s = html.find('Go - Biological process')
        if s != -1:
            s = html .find('</h4>',s)
            e = html .find('</span>',s)
    
            KBP = KBP + drop_label(html[s:e])
    
        s = html.find('<div class="section " id="interaction">')
        if s != -1:
            e = html.find('</div>',s)
    
            content = html[s:e]
            s = content.find('<table class="databaseTable INTERACTION">')
    
            Interaction = ''     
            st = ''
            # Walk "</span> name <sup> ... <a ...> partner <br/>" groups,
            # accumulating "name:partner" entries separated by commas.
            while True:
                s = content.find('</span>',s)
                if s == -1:
                    break
                if st != '':
                    st = st + ','
                e = content.find('<sup>',s)
    
                st = st + content[s+7:e] + ':'
                s = content.find('<a ',e)
                e = content.find('<br/>',s)
    
                st = st + drop_label(content[s:e])
                
            Interaction = st

        s = html.find('<div class="section " id="subcellular_location">')
        if s != -1:
            s = html.find('<div class="annotation">',s)
            if s != -1:
                e = html.find('</div>',s)

                content = html[s:e]
                e = 0
                # Collect the text of each <li><a ...>...</a> entry.
                while True:
                    s = content.find('<li>',e)

                    if s == -1:
                        break
                    if location != '':
                        location = location + ','
                    s = content.find('<a ',s)
                    e = content.find('</a>',s)
    
                    location = location + drop_label(content[s:e])

            # Fallback: use the "Keywords - Cellular component" span.
            if location == '':
                s = html.find('Keywords - Cellular component')
                if s != -1:
                    s = html.find('<span>',s)
                    e = html.find('</span>',s)
                    location = drop_label(html[s:e])

            # NOTE(review): unreachable -- the preceding ``if`` already
            # handled location == '', so this branch can never run.
            elif location == '':
                s = html.find('<ul class="noNumbering cellular_component">')
                if s != -1:
                    e = html.find('</ul>',s)
                    content = html[s:e]
                    e = 0
                    while True:
                        s = content.find('<a ',e)
                        if s == -1:
                            break;
                        if location != '':
                            location = location + ','
                        # NOTE(review): find('</a>') lacks a start offset,
                        # so ``e`` never advances -- would loop forever if
                        # this dead branch were ever reached.
                        e = content.find('</a>')
                        location = location + drop_label(content[s:e])

        data.append([n,url,GMF,KBP,Interaction,location])
    return data

if __name__=="__main__":
    path = 'D:\\Protein_Search\\test.xls'
    data = begin_search(path)
    save_data(data,path.replace('.xlsx','_Result'))

只是一个简单检索文献的工具

  • 点赞
  • 收藏
  • 复制链接分享

1条回答

  • SoftwareTeacher SoftwareTeacher 2月前

    代码请用 ”代码“ 控件插入。  另外请说明你的输入是什么, 单步执行的结果是什么

    点赞 评论 复制链接分享