weixin_44337725 2021-02-26 11:52 采纳率: 0%
浏览 119
已结题

运行成功但是不出结果,请问各位大佬是怎么回事?

#  -*- encoding:utf-8 -*-
import sys
import urllib.request
import xlwt
import os
import xlrd
import HTMLParser

def read_excel(path):
    """Read the first worksheet of the workbook at *path*.

    Row 0 is skipped (presumably a header row -- confirm against the input
    file). Returns a list with one list of cell values per remaining row.
    """
    workbook = xlrd.open_workbook(path)
    sheet = workbook.sheets()[0]
    row_count = sheet.nrows
    return [list(sheet.row_values(idx)) for idx in range(1, row_count)]

def save_data(data,path,charset = 'utf-8'):
    """Write *data* (a sequence of rows of cell values) to .xls workbooks.

    Rows are written to a sheet named 'DATA'. Every time 65535 rows have been
    written, the workbook filled so far is saved as ``path-<n>.xls`` (n = 1,
    2, ...) and a fresh workbook is started; the last (or only) workbook is
    saved as ``path.xls``.

    data    -- sequence of rows; each row is a sequence of cell values.
    path    -- output path without the '.xls' extension.
    charset -- encoding used to decode cell values that are ``bytes``.
    """
    max_rows = 65535
    xlsx = xlwt.Workbook()
    table = xlsx.add_sheet('DATA')
    for row, values in enumerate(data):
        # Integer arithmetic is required here: with true division the chunk
        # number and the sheet row would become floats, which are neither
        # valid row indices nor sensible file-name suffixes.
        num, sheet_row = divmod(row, max_rows)
        if sheet_row == 0 and num > 0:
            xlsx.save(path + '-' + str(num) + '.xls')
            xlsx = xlwt.Workbook()
            table = xlsx.add_sheet('DATA')
        for col, value in enumerate(values):
            # Only bytes need decoding; every other value is written as-is.
            if isinstance(value, bytes):
                value = value.decode(charset, 'ignore')
            table.write(sheet_row, col, value)
    xlsx.save(path + '.xls')
    
def save_page(path,pmid,html):
    """Store a downloaded page as ``<path><pmid>.html``.

    path -- directory prefix; it is concatenated directly with the file name,
            so it must end with a path separator. Created if it is missing.
    pmid -- identifier used as the file name (without extension).
    html -- page content, either ``bytes`` (as returned by
            ``urlopen().read()``) or ``str``.

    Raises OSError if the directory or file cannot be written.
    """
    if not os.path.exists(path):
        os.makedirs(path)
    path = path + pmid + '.html'
    if isinstance(html, bytes):
        # Raw response bodies are bytes; writing them to a text-mode file
        # raises TypeError and leaves an empty file behind.
        with open(path, 'wb') as html_file:
            html_file.write(html)
    else:
        with open(path, 'w', encoding='utf-8') as html_file:
            html_file.write(html)
    print(pmid, ' save success!')
        
def get_dir(path):
    """Return the entry names found in directory *path*.

    An empty list is returned when *path* does not exist.
    """
    if os.path.exists(path):
        return list(os.listdir(path))
    return []

def drop_label(string):
    """Strip every ``<...>`` span from *string* and return the remainder.

    Scanning stops at the first '<' that has no '>' after it; everything from
    that point on is kept unchanged.
    """
    kept = []
    cursor = 0
    while True:
        opening = string.find('<', cursor)
        if opening == -1:
            break
        closing = string.find('>', opening)
        if closing == -1:
            break
        kept.append(string[cursor:opening])
        cursor = closing + 1
    kept.append(string[cursor:])
    return ''.join(kept)

def drop_space(string):
    """Remove spaces, tabs and '&nbsp;' entities and collapse blank lines.

    Consecutive CRLF pairs are reduced until no ``\\r\\n\\r\\n`` remains.
    """
    # Deleting a single character can never create a new occurrence of it,
    # so one replace call per character removes them all.
    string = string.replace(' ', '').replace("\t", '')
    while "\r\n\r\n" in string:
        string = string.replace("\r\n\r\n", '\r\n')
    # Deleting an entity can splice a new one together, hence the loop.
    while '&nbsp;' in string:
        string = string.replace('&nbsp;', '')
    return string

def HtmlParsing(string):
    """Return *string* unchanged (placeholder; no parsing is performed)."""
    return string

def begin_search(path):
    """Look up every accession listed in the workbook at *path* on UniProt.

    The first cell of each data row is taken as the accession. Pages are read
    from the local '.\\Protein_Pages\\' cache when present, otherwise they
    are downloaded and cached. Accessions whose page cannot be downloaded are
    reported and skipped.

    Returns a list of ``[accession, url, GMF, KBP, Interaction, location]``
    rows, where the last four items are text extracted from the page (empty
    strings when the corresponding section is not found).
    """
    nums = read_excel(path)
    print('Search Number : ', len(nums))
    cache_dir = '.\\Protein_Pages\\'
    files = set(get_dir(cache_dir))
    url_head = r'http://www.uniprot.org/uniprot/'
    data = []
    for index, num in enumerate(nums):
        if not num or num[0] == '':
            continue
        n = num[0]
        print('Protein', index, ':', n)
        url = url_head + n
        html = _load_protein_page(n, url, files, cache_dir)
        if html is None:
            continue
        html = html.replace('&lt;','<')

        GMF = _section_text(html, 'GO - Molecular function', '</h4>')
        cut = GMF.find('Source:')
        if cut != -1:
            # Without this check a missing 'Source:' would slice off the
            # last character (GMF[:-1]).
            GMF = GMF[:cut]

        # NOTE(review): the original only looked for 'Go - Biological
        # process' (lower-case 'o'), which differs in case from the
        # molecular-function heading above. The upper-case spelling is tried
        # as a fallback -- confirm against the actual page markup.
        KBP = (_section_text(html, 'Go - Biological process', '</h4>')
               or _section_text(html, 'GO - Biological process', '</h4>'))

        Interaction = _interaction_text(html)
        location = _location_text(html)

        data.append([n,url,GMF,KBP,Interaction,location])
    return data


def _load_protein_page(n, url, files, cache_dir):
    """Return the page for accession *n* as text, or None if it is unavailable."""
    from http.client import HTTPException

    if n + '.html' in files:
        with open(cache_dir + n + '.html', 'rb') as cached:
            raw = cached.read()
        # An empty cache file carries no information; download it again.
        if raw:
            return raw.decode('utf-8', 'replace')
    try:
        with urllib.request.urlopen(url) as response:
            raw = response.read()
    except (OSError, ValueError, HTTPException) as exc:
        print(n + ' : Connection Failed!', exc)
        return None
    # urlopen().read() yields bytes; all later parsing works on str.
    html = raw.decode('utf-8', 'replace')
    try:
        save_page(cache_dir, n, html)
    except (OSError, UnicodeError) as exc:
        # A failed cache write must not discard a page we already have.
        print(n + ' : page could not be cached:', exc)
    return html


def _section_text(html, heading, open_marker):
    """Return the tag-free text between *open_marker* and '</span>' after *heading*.

    Returns '' when the heading or either marker is missing.
    """
    pos = html.find(heading)
    if pos == -1:
        return ''
    start = html.find(open_marker, pos)
    if start == -1:
        return ''
    end = html.find('</span>', start)
    if end == -1:
        return ''
    return drop_label(html[start:end])


def _link_texts(content, item_marker=''):
    """Return the comma-joined texts of the '<a ...>' links in *content*.

    When *item_marker* is given, each link is searched for only after the
    next occurrence of that marker.
    """
    names = []
    pos = 0
    while True:
        if item_marker:
            pos = content.find(item_marker, pos)
            if pos == -1:
                break
        start = content.find('<a ', pos)
        if start == -1:
            break
        end = content.find('</a>', start)
        if end == -1:
            end = len(content)
        name = drop_label(content[start:end])
        if name:
            names.append(name)
        # Resume after this link so the scan always moves forward.
        pos = end
    return ','.join(names)


def _interaction_text(html):
    """Return 'name:partner' pairs from the interaction section, comma-joined."""
    start = html.find('<div class="section " id="interaction">')
    if start == -1:
        return ''
    end = html.find('</div>', start)
    content = html[start:end] if end != -1 else html[start:]
    pos = content.find('<table class="databaseTable INTERACTION">')
    entries = []
    while pos != -1:
        pos = content.find('</span>', pos)
        if pos == -1:
            break
        sup = content.find('<sup>', pos)
        if sup == -1:
            break
        # 7 == len('</span>'): the name sits between '</span>' and '<sup>'.
        entry = content[pos + 7:sup] + ':'
        link = content.find('<a ', sup)
        if link != -1:
            line_end = content.find('<br/>', link)
            if line_end == -1:
                line_end = len(content)
            entry += drop_label(content[link:line_end])
        entries.append(entry)
        pos = link
    return ','.join(entries)


def _location_text(html):
    """Return the subcellular location names found in *html*, comma-joined.

    Nothing is extracted unless the subcellular-location section exists. In
    it, the annotation list is tried first, then the 'Keywords - Cellular
    component' text, then the cellular-component link list.
    """
    start = html.find('<div class="section " id="subcellular_location">')
    if start == -1:
        return ''
    location = ''
    start = html.find('<div class="annotation">', start)
    if start != -1:
        end = html.find('</div>', start)
        content = html[start:end] if end != -1 else html[start:]
        location = _link_texts(content, '<li>')
    if location == '':
        location = _section_text(html, 'Keywords - Cellular component', '<span>')
    # This was an unreachable 'elif' with the same condition as the 'if'
    # above; it is now a real second fallback.
    if location == '':
        start = html.find('<ul class="noNumbering cellular_component">')
        if start != -1:
            end = html.find('</ul>', start)
            content = html[start:end] if end != -1 else html[start:]
            location = _link_texts(content)
    return location

if __name__=="__main__":
    # Script entry point: search every accession in the input workbook and
    # write the results next to it as '<input name>_Result.xls'.
    path = 'D:\\Protein_Search\\test.xls'
    data = begin_search(path)
    # Strip whatever extension the input has. Replacing the literal '.xlsx'
    # did nothing for a '.xls' path and produced 'test.xls.xls'.
    save_data(data, os.path.splitext(path)[0] + '_Result')

只是一个简单检索文献的工具

  • 写回答

1条回答 默认 最新

  • SoftwareTeacher 《编程之美》作者 2021-02-26 12:05
    关注

    代码请用“代码”控件插入。另外请说明你的输入是什么,单步执行的结果是什么。

    评论

报告相同问题?

悬赏问题

  • ¥15 使用rabbitMQ 消息队列作为url源进行多线程爬取时,总有几个url没有处理的问题。
  • ¥85 maple软件,solve求反函数,出现rootof怎么办?
  • ¥15 求chat4.0解答一道线性规划题,用lingo编程运行,第一问要求写出数学模型和lingo语言编程模型,第二问第三问解答就行,我的ddl要到了谁来求了
  • ¥15 Ubuntu在安装序列比对软件STAR时出现报错如何解决
  • ¥50 树莓派安卓APK系统签名
  • ¥15 maple软件,用solve求反函数出现rootof,怎么办?
  • ¥65 汇编语言除法溢出问题
  • ¥15 Visual Studio问题
  • ¥20 求一个html代码,有偿
  • ¥100 关于使用MATLAB中copularnd函数的问题