名曰大喵 2022-12-19 06:52 · acceptance rate: 0% · 67 views

IndexError: list index out of range

While learning Python web scraping I ran into a problem: writing the data to xlsx always fails with "list index out of range".


# -*- coding: utf-8 -*-
from bs4 import BeautifulSoup
from urllib import parse
import multiprocessing
from selenium import webdriver
import re
import xlwt
import time
import requests
import variables


def main():

    flag = True
    while(flag):
        choose = int(input("请输入你的选择,1为指定工作岗位,2为全部工作岗位:"))
        if(choose == 1):
            single_job()
            flag = False
        elif(choose == 2):
            all_job()
            flag = False
        else:
            print("您输入的指令有误,请重新输入")
    spider_1()
    spider_2()
    spider_3()
    savedata()


def single_job():
    variables.keyword = input("请输入你要搜索的岗位关键字:")
    variables.Keyword = parse.quote(parse.quote(variables.keyword))
    variables.pagenum = int(input("请输入你要爬取的数据页数(一页五十条数据):"))
    for i in range(0, variables.pagenum):
        variables.list_baseurl.append(
            "https://search.51job.com/list/000000,000000,0000,00,9,99," + variables.Keyword + ",2," + str(
                i + 1) + ".html")


def all_job():
    variables.pagenum = int(input("请输入你要爬取的数据页数(一页五十条数据):"))
    for i in range(0, variables.pagenum):
        variables.list_baseurl.append(
            "https://search.51job.com/list/000000,000000,0000,00,9,99,+,2," + str(
                i + 1) + ".html")


def askURL(url):
    head = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                      "AppleWebKit/537.36 (KHTML, like Gecko) "
                      "Chrome/87.0.4280.66 Safari/537.36"
    }
    if url == "暂未填写":
        return 0
    res = requests.get(url, headers=head)
    enc = res.apparent_encoding  # detected page encoding
    if re.search('GB.*', enc):
        enc = 'GBK'
    res.encoding = enc
    html = res.text
    return html


def askURL_base(baseurl):
    opt = webdriver.ChromeOptions()
    opt.add_argument('headless')
    driver = webdriver.Chrome(executable_path=r'chromedriver', options=opt)
    driver.get(baseurl)
    element = driver.page_source
    driver.quit()
    return element


def getData_1(element):

    bs = BeautifulSoup(element, "html.parser")
    resultList_a = bs.select("div.j_joblist > div.e > a.el")
    resultList_name = bs.select(
        "div.j_joblist > div.e > a.el > p.t >span.jname.at")
    resultList_time = bs.select(
        "div.j_joblist > div.e > a.el > p.t >span.time")
    resultList_salary = bs.select(
        "div.j_joblist > div.e > a.el > p.info >span.sal")
    resultList_position = bs.select(
        "div.j_joblist > div.e > a.el > p.info >span.d.at")
    resultList_subsidy = bs.select("div.j_joblist > div.e > a.el ")
    resultList_company = bs.select("div.j_joblist > div.e > div.er >a")
    resultList_type_num = bs.select("div.j_joblist > div.e > div.er >p.dc.at")
    resultList_company_project = bs.select(
        "div.j_joblist > div.e > div.er >p.int.at")

    for i in resultList_a:
        variables.list_a.append(i["href"])
    for item in resultList_name:
        item = str(item)
        variables.list_title.append(re.findall(re.compile(
            r'<span class="jname at" title=".*">(.*)</span>'), item))
    for item in resultList_time:
        item = str(item)
        variables.list_time.append(re.findall(
            re.compile(r'<span class="time">(.*)</span>'), item))
    for item in variables.list_time:
        item = str(item)
        variables.list_time_format.append(time.strftime('%Y', time.localtime(
            time.time())) + '-' + item.replace('发布\']', '').replace('[\'', ''))
    for item in resultList_salary:
        item = str(item)
        variables.list_salary.append(re.findall(
            re.compile(r'<span class="sal">(.*)</span>'), item))
    for item in resultList_position:
        item = str(item)
        variables.list_positon.append(re.findall(
            re.compile(r'<span class="d at">(.*)</span>'), item))
    for item in resultList_subsidy:
        item = str(item)
        m = re.findall(re.compile(
            r'<p class="tags" title="(.*)"><span>'), item)
        if (0 != len(m)):
            variables.list_subsidy.append(m)
        else:
            variables.list_subsidy.append("暂未填写")
    for item in resultList_company:
        item = str(item)
        variables.list_company_a.append(re.findall(
            re.compile(r'a class="cname at" href="(.*)" target'), item))
    for item in resultList_company:
        item = str(item)
        variables.list_company_name.append(
            re.findall(re.compile(r'title="(.*)">'), item))
    for item in resultList_type_num:
        item = str(item)
        variables.list_type_num.append(re.findall(
            re.compile(r'<p class="dc at">(.*)</p>'), item))
    for item in resultList_company_project:
        item = str(item)
        variables.list_company_project.append(re.findall(
            re.compile(r'<p class="int at">(.*)</p>'), item))


def getData_2(html):
    bs = BeautifulSoup(html, "html.parser")
    resultList_require = bs.select("div.bmsg.job_msg.inbox")
    if(len(resultList_require) != 0):
        for item in resultList_require:
            item = filter_tag(str(item))
            variables.list_require.append(item)
    else:
        variables.list_require.append("暂未填写")
    resultList_address = bs.select("div.bmsg.inbox>p.fp")
    if(len(resultList_address) != 0):
        for item in resultList_address:
            item = filter_tag(str(item))
            variables.list_address.append(item)
    else:
        variables.list_address.append("暂未填写")
    resultList_a_2 = bs.select(
        "div.tHeader.tHjob > div.in > div.cn > p.cname >a.catn")
    m = 0
    for item in resultList_a_2:
        m = m+1
    if(m == 0):
        variables.list_a_2.append("暂未填写")
        variables.list_company_address.append('暂未填写')
        variables.list_company_web.append('暂未填写')
    else:
        for i in resultList_a_2:
            variables.list_a_2.append(i["href"])


def getData_3(html):
    if html == 0:
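        # placeholder entries ("暂未填写") already received placeholder rows in getData_2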
        return 0
    bs = BeautifulSoup(html, "html.parser")
    resultList_company_message = bs.select(
        "div.tBorderTop_box.bmsg > div.inbox >p.fp ")
    if(len(resultList_company_message)) == 0:
        variables.list_company_address.append("暂未填写")
        variables.list_company_web.append("暂未填写")
    elif(len(resultList_company_message)) == 1:
        variables.list_company_address.append(
            filter_tag(str(resultList_company_message[0])))
        variables.list_company_web.append("暂未填写")
    else:
        variables.list_company_address.append(
            filter_tag(str(resultList_company_message[0])))
        variables.list_company_web.append(
            filter_tag(str(resultList_company_message[1])))


def spider_1():
    print("------------------------第一层爬取-------------------------")
    time_1 = time.time()
    pool = multiprocessing.Pool(multiprocessing.cpu_count())
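    # pool.map only fetches page sources in worker processes; parsing runs
    # below in the parent, so the lists in variables.py are filled correctly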
    pages = pool.map(askURL_base, variables.list_baseurl)
    for i in range(0, len(pages)):
        print("----------------------第一层解析第" +
              str(i + 1) + "次----------------------")
        getData_1(pages[i])
    pool.close()
    pool.join()
    print("------------------------爬取成功!-------------------------")
    print("第一层数据爬取耗时:", time.time() - time_1, "秒")


def spider_2():
    print("------------------------第二层爬取-------------------------")
    time_2 = time.time()
    pool = multiprocessing.Pool(multiprocessing.cpu_count())
    list1 = pool.map(askURL, variables.list_a)
    pool.close()
    pool.join()
    for i in range(0, len(list1)):
        print("----------------------第二层解析第" +
              str(i + 1) + "次----------------------")
        getData_2(list1[i])
    print("------------------------爬取成功!-------------------------")
    print("第二层数据爬取耗时:", time.time() - time_2, "秒")


def spider_3():
    print("------------------------第三层爬取-------------------------")
    time_3 = time.time()
    pool = multiprocessing.Pool(multiprocessing.cpu_count())
    list2 = pool.map(askURL, variables.list_a_2)
    pool.close()
    pool.join()
    for i in range(0, len(list2)):
        print("----------------------第三层解析第" +
              str(i + 1) + "次----------------------")
        getData_3(list2[i])
    print("------------------------爬取成功!-------------------------")
    print("第三层数据爬取耗时:", time.time() - time_3, "秒")


def savedata():
    book = xlwt.Workbook(encoding="utf-8", style_compression=0)
    sheet = book.add_sheet(variables.keyword+"招聘信息", cell_overwrite_ok=True)
    print("-------------------------写入数据-------------------------")
    col = ("招聘链接", "岗位名称", "发布时间", "岗位薪资", "工作地点", "福利政策", "公司信息链接",
           "公司名称", "公司类型、规模", "主营", "工作要求", "工作地址", "公司地址", "公司官网")
    for i in range(0, 14):
        if i in (0, 4, 5, 6, 7, 11, 12, 13):
            sheet.col(i).width = 10000
        elif i in (1, 8, 9):
            sheet.col(i).width = 6000
        elif i in (2, 3):
            sheet.col(i).width = 3000
        elif i == 10:
            sheet.col(i).width = 65535
    for i in range(0, 14):
        sheet.write(0, i, col[i])  # column headers
    for i in range(0, variables.pagenum*50):
        sheet.write(i + 1, 1, variables.list_title[i])      # the IndexError is raised here
        sheet.write(i + 1, 2, variables.list_time_format[i])
        sheet.write(i + 1, 3, variables.list_salary[i])
        sheet.write(i + 1, 4, variables.list_positon[i])
        sheet.write(i + 1, 5, variables.list_subsidy[i])
        sheet.write(i + 1, 6, variables.list_company_a[i])
        sheet.write(i + 1, 7, variables.list_company_name[i])
        sheet.write(i + 1, 8, variables.list_type_num[i])
        sheet.write(i + 1, 9, variables.list_company_project[i])
        sheet.write(i + 1, 10, variables.list_require[i])
        sheet.write(i + 1, 11, variables.list_address[i])
        sheet.write(i + 1, 12, variables.list_company_address[i])
        sheet.write(i + 1, 13, variables.list_company_web[i])
        print("正在写入第", i+1, "条信息")
    Path = "./{}.xls".format(str(time.strftime('%Y-%m-%d ',
                             time.localtime(time.time())) + variables.keyword+'招聘信息'))
    book.save(Path)
    print("爬取完毕!")
    print("保存的文件路径为:" + Path)

**Error message: sheet.write(i + 1, 1, variables.list_title[i])
IndexError: list index out of range

I tried tweaking these for loops, but it still goes out of range**
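
A quick way to see which list is actually too short: print every list's length just before the write loop in savedata(). A minimal diagnostic sketch (assumes it is pasted inside savedata(), where variables is already imported):

names = ["list_title", "list_time_format", "list_salary", "list_positon",
         "list_subsidy", "list_company_a", "list_company_name", "list_type_num",
         "list_company_project", "list_require", "list_address",
         "list_company_address", "list_company_web"]
for name in names:
    print(name, "->", len(getattr(variables, name)))
print("rows the write loop expects:", variables.pagenum * 50)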

# variables.py:

list_a = []
list_title = []
list_time = []
list_time_format = []
list_salary = []
list_positon = []
list_subsidy = []
list_company_a = []
list_company_name = []
list_type_num = []
list_company_project = []
list_require=[]
list_address=[]
list_a_2=[]
list_company_address=[]
list_company_web=[]
list_baseurl=[]
keyword = ""
pagenum = 0

print(len(list_a))

This is my first time learning this; would some kind soul with a spare minute please take a look for a total beginner?


3 answers

  • 於黾 2022-12-19 09:00

    What is variables.pagenum*50 supposed to be? One page holds 50 entries, so that is the number of records you *want* to scrape. Is the number of records actually scraped really that large? If fewer were collected, the index is bound to go out of range.
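
    A hedged sketch of that fix: bound the write loop in savedata() by the data actually collected instead of pagenum*50 (all names below come from the variables.py shown above):

    rows = min(len(variables.list_title), len(variables.list_time_format),
               len(variables.list_salary), len(variables.list_positon),
               len(variables.list_subsidy), len(variables.list_company_a),
               len(variables.list_company_name), len(variables.list_type_num),
               len(variables.list_company_project), len(variables.list_require),
               len(variables.list_address), len(variables.list_company_address),
               len(variables.list_company_web))
    for i in range(rows):
        sheet.write(i + 1, 1, variables.list_title[i])
        # ... write the remaining columns exactly as in the original loop ...

    If the lengths differ, the parsing in getData_1/getData_2/getData_3 is appending unevenly, and that is worth fixing first.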

