While learning Python web scraping, I hit a problem: writing the scraped data to an Excel workbook keeps raising "list index out of range".
# -*- coding: utf-8 -*-
from bs4 import BeautifulSoup
from urllib import parse
import multiprocessing
from selenium import webdriver
import re
import xlwt
import time
import requests
import variables


def main():
    flag = True
    while flag:
        choose = int(input("请输入你的选择,1为指定工作岗位,2为全部工作岗位:"))
        if choose == 1:
            single_job()
            flag = False
        elif choose == 2:
            all_job()
            flag = False
        else:
            print("您输入的指令有误,请重新输入")
    spider_1()
    spider_2()
    spider_3()
    savedata()
def single_job():
    variables.keyword = input("请输入你要搜索的岗位关键字:")
    variables.Keyword = parse.quote(parse.quote(variables.keyword))
    variables.pagenum = int(input("请输入你要爬取的数据页数(一页五十条数据):"))
    for i in range(0, variables.pagenum):
        variables.list_baseurl.append(
            "https://search.51job.com/list/000000,000000,0000,00,9,99," + variables.Keyword + ",2," + str(
                i + 1) + ".html")


def all_job():
    variables.pagenum = int(input("请输入你要爬取的数据页数(一页五十条数据):"))
    for i in range(0, variables.pagenum):
        variables.list_baseurl.append(
            "https://search.51job.com/list/000000,000000,0000,00,9,99,+,2," + str(
                i + 1) + ".html")
def askURL(url):
    head = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
                      "(KHTML, like Gecko) Chrome/87.0.4280.66 Safari/537.36"
    }
    if url == "暂未填写":
        return 0
    res = requests.get(url, headers=head)
    enc = res.apparent_encoding  # renamed from "str" to avoid shadowing the built-in
    if re.search('GB.*', enc):
        enc = 'GBK'
    res.encoding = enc
    html = res.text
    return html
def askURL_base(baseurl):
    opt = webdriver.ChromeOptions()
    opt.add_argument('headless')
    driver = webdriver.Chrome(executable_path=r'chromedriver', options=opt)
    driver.get(baseurl)
    element = driver.page_source
    driver.quit()
    return element
def getData_1(element):
    bs = BeautifulSoup(element, "html.parser")
    resultList_a = bs.select("div.j_joblist > div.e > a.el")
    resultList_name = bs.select(
        "div.j_joblist > div.e > a.el > p.t > span.jname.at")
    resultList_time = bs.select(
        "div.j_joblist > div.e > a.el > p.t > span.time")
    resultList_salary = bs.select(
        "div.j_joblist > div.e > a.el > p.info > span.sal")
    resultList_position = bs.select(
        "div.j_joblist > div.e > a.el > p.info > span.d.at")
    resultList_subsidy = bs.select("div.j_joblist > div.e > a.el")
    resultList_company = bs.select("div.j_joblist > div.e > div.er > a")
    resultList_type_num = bs.select("div.j_joblist > div.e > div.er > p.dc.at")
    resultList_company_project = bs.select(
        "div.j_joblist > div.e > div.er > p.int.at")
    for i in resultList_a:
        variables.list_a.append(i["href"])
    for item in resultList_name:
        item = str(item)
        variables.list_title.append(re.findall(re.compile(
            r'<span class="jname at" title=".*">(.*)</span>'), item))
    for item in resultList_time:
        item = str(item)
        found = re.findall(re.compile(r'<span class="time">(.*)</span>'), item)
        variables.list_time.append(found)
        # Format only the time parsed in this call. The original looped over
        # the whole variables.list_time here, which re-appended every earlier
        # page's entries on each call and let list_time_format drift out of
        # step with the other lists.
        variables.list_time_format.append(time.strftime('%Y', time.localtime(
            time.time())) + '-' + str(found).replace('发布\']', '').replace('[\'', ''))
    for item in resultList_salary:
        item = str(item)
        variables.list_salary.append(re.findall(
            re.compile(r'<span class="sal">(.*)</span>'), item))
    for item in resultList_position:
        item = str(item)
        variables.list_positon.append(re.findall(
            re.compile(r'<span class="d at">(.*)</span>'), item))
    for item in resultList_subsidy:
        item = str(item)
        m = re.findall(re.compile(
            r'<p class="tags" title="(.*)"><span>'), item)
        if len(m) != 0:
            variables.list_subsidy.append(m)
        else:
            variables.list_subsidy.append("暂未填写")
    for item in resultList_company:
        item = str(item)
        variables.list_company_a.append(re.findall(
            re.compile(r'a class="cname at" href="(.*)" target'), item))
        variables.list_company_name.append(
            re.findall(re.compile(r'title="(.*)">'), item))
    for item in resultList_type_num:
        item = str(item)
        variables.list_type_num.append(re.findall(
            re.compile(r'<p class="dc at">(.*)</p>'), item))
    for item in resultList_company_project:
        item = str(item)
        variables.list_company_project.append(re.findall(
            re.compile(r'<p class="int at">(.*)</p>'), item))
def filter_tag(html_str):
    # The original post never shows filter_tag; this minimal stand-in (my
    # reconstruction, not the poster's helper) strips HTML tags and squeezes
    # whitespace so the script runs as pasted.
    return re.sub(r'\s+', ' ', re.sub(r'<[^>]+>', '', html_str)).strip()


def getData_2(html):
    bs = BeautifulSoup(html, "html.parser")
    resultList_require = bs.select("div.bmsg.job_msg.inbox")
    if len(resultList_require) != 0:
        for item in resultList_require:
            item = filter_tag(str(item))
            variables.list_require.append(item)
    else:
        variables.list_require.append("暂未填写")
    resultList_address = bs.select("div.bmsg.inbox > p.fp")
    if len(resultList_address) != 0:
        for item in resultList_address:
            item = filter_tag(str(item))
            variables.list_address.append(item)
    else:
        variables.list_address.append("暂未填写")
    resultList_a_2 = bs.select(
        "div.tHeader.tHjob > div.in > div.cn > p.cname > a.catn")
    if len(resultList_a_2) == 0:
        variables.list_a_2.append("暂未填写")
        variables.list_company_address.append('暂未填写')
        variables.list_company_web.append('暂未填写')
    else:
        for i in resultList_a_2:
            variables.list_a_2.append(i["href"])
def getData_3(html):
    if html == 0:
        return 0
    bs = BeautifulSoup(html, "html.parser")
    resultList_company_message = bs.select(
        "div.tBorderTop_box.bmsg > div.inbox > p.fp")
    if len(resultList_company_message) == 0:
        variables.list_company_address.append("暂未填写")
        variables.list_company_web.append("暂未填写")
    elif len(resultList_company_message) == 1:
        variables.list_company_address.append(
            filter_tag(str(resultList_company_message[0])))
        variables.list_company_web.append("暂未填写")
    else:
        variables.list_company_address.append(
            filter_tag(str(resultList_company_message[0])))
        variables.list_company_web.append(
            filter_tag(str(resultList_company_message[1])))
def spider_1():
    print("------------------------第一层爬取-------------------------")
    time_1 = time.time()
    pool = multiprocessing.Pool(multiprocessing.cpu_count())
    pages = pool.map(askURL_base, variables.list_baseurl)  # renamed from "list" to avoid shadowing the built-in
    pool.close()
    pool.join()
    for i in range(0, len(pages)):
        print("----------------------第一层解析第" +
              str(i + 1) + "次----------------------")
        getData_1(pages[i])
    print("------------------------爬取成功!-------------------------")
    print("第一层数据爬取耗时:", time.time() - time_1, "秒")


def spider_2():
    print("------------------------第二层爬取-------------------------")
    time_2 = time.time()
    pool = multiprocessing.Pool(multiprocessing.cpu_count())
    list1 = pool.map(askURL, variables.list_a)
    pool.close()
    pool.join()
    for i in range(0, len(list1)):
        print("----------------------第二层解析第" +
              str(i + 1) + "次----------------------")
        getData_2(list1[i])
    print("------------------------爬取成功!-------------------------")
    print("第二层数据爬取耗时:", time.time() - time_2, "秒")


def spider_3():
    print("------------------------第三层爬取-------------------------")
    time_3 = time.time()
    pool = multiprocessing.Pool(multiprocessing.cpu_count())
    list2 = pool.map(askURL, variables.list_a_2)
    pool.close()
    pool.join()
    for i in range(0, len(list2)):
        print("----------------------第三层解析第" +
              str(i + 1) + "次----------------------")
        getData_3(list2[i])
    print("------------------------爬取成功!-------------------------")
    print("第三层数据爬取耗时:", time.time() - time_3, "秒")
def savedata():
    book = xlwt.Workbook(encoding="utf-8", style_compression=0)
    sheet = book.add_sheet(variables.keyword + "招聘信息", cell_overwrite_ok=True)
    print("-------------------------写入数据-------------------------")
    col = ("招聘链接", "岗位名称", "发布时间", "岗位薪资", "工作地点", "福利政策", "公司信息链接",
           "公司名称", "公司类型、规模", "主营", "工作要求", "工作地址", "公司地址", "公司官网")
    for i in range(0, 14):  # was range(0, 13), so column 13 never got a width
        if i in (0, 4, 5, 6, 7, 11, 12, 13):
            sheet.col(i).width = 10000
        elif i in (1, 8, 9):
            sheet.col(i).width = 6000
        elif i in (2, 3):
            sheet.col(i).width = 3000
        elif i == 10:
            sheet.col(i).width = 65535
    for i in range(0, 14):
        sheet.write(0, i, col[i])  # header row
    for i in range(0, variables.pagenum * 50):
        sheet.write(i + 1, 1, variables.list_title[i])  # the IndexError is raised here
        sheet.write(i + 1, 2, variables.list_time_format[i])
        sheet.write(i + 1, 3, variables.list_salary[i])
        sheet.write(i + 1, 4, variables.list_positon[i])
        sheet.write(i + 1, 5, variables.list_subsidy[i])
        sheet.write(i + 1, 6, variables.list_company_a[i])
        sheet.write(i + 1, 7, variables.list_company_name[i])
        sheet.write(i + 1, 8, variables.list_type_num[i])
        sheet.write(i + 1, 9, variables.list_company_project[i])
        sheet.write(i + 1, 10, variables.list_require[i])
        sheet.write(i + 1, 11, variables.list_address[i])
        sheet.write(i + 1, 12, variables.list_company_address[i])
        sheet.write(i + 1, 13, variables.list_company_web[i])
        print("正在写入第", i + 1, "条信息")
    Path = "./{}.xls".format(str(time.strftime('%Y-%m-%d ',
                                               time.localtime(time.time())) + variables.keyword + '招聘信息'))
    book.save(Path)
    print("爬取完毕!")
    print("保存的文件路径为:" + Path)


if __name__ == "__main__":
    main()  # entry-point guard; required on Windows because the script uses multiprocessing
**Error message:
sheet.write(i + 1, 1, variables.list_title[i])
IndexError: list index out of range
I have already tried tweaking these two for loops, but it still goes out of range.**
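To show what I think is going on, here is a self-contained toy version of the failure plus the kind of guard I am considering (the variable names below are mine, not from the script above): the write loop assumes exactly pagenum * 50 parsed rows, but whenever a page yields fewer than fifty matches, or a selector misses an entry, the lists come out shorter, and indexing past their end raises exactly this error.

# Toy reproduction: the loop bound and the list length disagree.
parsed_titles = ["job A", "job B", "job C"]   # pretend only 3 rows were parsed
expected_rows = 1 * 50                        # but the loop expects pagenum * 50

# The failing pattern:
#     for i in range(expected_rows):
#         parsed_titles[i]                    # IndexError once i reaches 3

# A possible guard: never index past the shortest list being written.
row_count = min(expected_rows, len(parsed_titles))
for i in range(row_count):
    print(parsed_titles[i])                   # safe: stops at the real length

In the real script the min() would have to cover every variables.list_* that savedata() indexes, since any one of them can be the short one.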
# variables.py:
list_a = []
list_title = []
list_time = []
list_time_format = []
list_salary = []
list_positon = []
list_subsidy = []
list_company_a = []
list_company_name = []
list_type_num = []
list_company_project = []
list_require = []
list_address = []
list_a_2 = []
list_company_address = []
list_company_web = []
list_baseurl = []
keyword = ""
pagenum = 0
print(len(list_a))  # leftover debug print; runs once at import time and always shows 0
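For debugging, I could also print every list's length right before savedata() runs; this dump_lengths helper is my own addition, not part of the original code, but it should show which list falls short of pagenum * 50:

# Hypothetical debugging helper: report how long each result list really is.
import variables

def dump_lengths():
    names = ("list_a", "list_title", "list_time_format", "list_salary",
             "list_positon", "list_subsidy", "list_company_a",
             "list_company_name", "list_type_num", "list_company_project",
             "list_require", "list_address", "list_company_address",
             "list_company_web")
    for name in names:
        print(name, len(getattr(variables, name)))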
This is my first attempt at scraping, so if anyone with a spare moment could take a look at this beginner's code, I would really appreciate it.