OllerGrande 2024-12-31 12:30 采纳率: 14.3%
浏览 11

爬虫+数据库+可视化


import requests
from bs4 import BeautifulSoup
import time
import sqlite3


# Desktop-browser User-Agent header; sent with every request so the site
# serves normal HTML instead of blocking the default `requests` UA.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36 Edg/128.0.0.0"
}


def getPositionInfo(detail_url):
    """Fetch one internship-detail page and store its fields in SQLite.

    Extracts job title, academic requirement, city (job_position) and
    salary from the detail page and passes them to insert_into_db().
    Missing fields are stored as "N/A"; failed requests are reported
    and skipped instead of crashing the crawl.
    """
    def _text(node, strip=False):
        # Extract a node's text, preserving the original behavior:
        # only the job title was stripped, the other fields were not.
        if node is None:
            return "N/A"
        return node.text.strip() if strip else node.text

    try:
        # timeout prevents the crawler from hanging forever on a stalled
        # connection; RequestException covers DNS/connect/read failures
        res = requests.get(detail_url, headers=headers, timeout=10)
    except requests.RequestException as e:
        print(f"请求异常:{e}")
        return
    if res.status_code == 200:
        soup = BeautifulSoup(res.text, "lxml")
        job = _text(soup.find(class_="new_job_name"), strip=True)
        academic = _text(soup.find(class_="job_academic"))
        position = _text(soup.find(class_="job_position"))
        salary = _text(soup.find(class_="job_money cutom_font"))
        insert_into_db(job, academic, position, salary)
    else:
        print(f"请求失败,状态码:{res.status_code}")


def insert_into_db(job, academic, position, salary):
    """Insert one scraped position record into internship_data.db.

    Creates the `positions` table on first use. The insert runs inside a
    transaction (`with conn`) so a failure is rolled back, and the
    connection is always closed — the original leaked the cursor and
    connection whenever an execute raised.
    """
    conn = sqlite3.connect('internship_data.db')
    try:
        with conn:  # commits on success, rolls back on error
            conn.execute('''
                CREATE TABLE IF NOT EXISTS positions (
                    id INTEGER PRIMARY KEY AUTOINCREMENT,
                    job TEXT,
                    academic TEXT,
                    position TEXT,
                    salary TEXT
                )
            ''')
            conn.execute('''
                INSERT INTO positions (job, academic, position, salary)
                VALUES (?,?,?,?)
            ''', (job, academic, position, salary))
    finally:
        conn.close()


def main():
    """Crawl 15 search-result pages and scrape every listed posting.

    Each result page is fetched with a timeout; a network error on one
    page is reported and the crawl continues with the next page instead
    of aborting (the original had neither a timeout nor error handling).
    A 2-second pause between pages keeps the request rate polite.
    """
    for i in range(1, 16):
        url = f"https://www.shixiseng.com/interns?page={i}&type=intern&keyword=%E7%AE%97%E6%B3%95%E5%AE%9E%E4%B9%A0%E7%94%9F&area=&months=&days=&degree=&official=entry&enterprise=&salary=-0&publishTime=&sortType=&city=%E5%85%A8%E5%9B%BD&internExtend="
        try:
            res = requests.get(url, headers=headers, timeout=10)
        except requests.RequestException as e:
            print(f"请求异常:{e}")
            time.sleep(2)
            continue
        if res.status_code == 200:
            soup = BeautifulSoup(res.text, "lxml")
            # Each result title links to the posting's detail page.
            for item in soup.find_all(class_="title ellipsis font"):
                getPositionInfo(item.attrs["href"])
        else:
            print(f"请求失败,状态码:{res.status_code}")
        time.sleep(2)


# Entry point for the crawler half of this file. NOTE(review): a second
# `__main__` guard appears further down for the visualization script —
# the two sections look like separately written files pasted together.
if __name__ == "__main__":
    main()



from pyecharts.charts import Line, Pie
import sqlite3
import pandas as pd
from pyecharts import options as opts


def fetch_data_from_db():
    """Load every scraped record as a DataFrame with columns city, salary.

    The `position` column actually holds the posting's city, so it is
    aliased to `city` for the downstream aggregation code. The
    connection is closed even if the query raises (the original leaked
    it on failure).
    """
    query = """
    SELECT position AS city, salary
    FROM positions
    """
    conn = sqlite3.connect('internship_data.db')
    try:
        return pd.read_sql_query(query, conn)
    finally:
        conn.close()


def process_data(df):
    """Aggregate per-city salary statistics from the raw records.

    Salary strings look like "100-200/天" or "300/天". Rows containing
    "薪资面议" (negotiable) or values that do not parse as integers
    (e.g. "N/A") are skipped.

    Args:
        df: DataFrame with `city` and `salary` string columns.

    Returns:
        (city_avg, city_count): dict mapping city -> floor-averaged
        daily salary, and dict mapping city -> number of usable rows.
    """
    city_salaries = {}
    for _, row in df.iterrows():
        city = row['city']
        salary = row['salary']
        if "薪资面议" in salary:
            continue  # negotiable pay carries no numeric information
        bounds = salary.split("/")[0].split("-")
        if len(bounds) == 1:
            low = high = bounds[0]
        elif len(bounds) == 2:
            low, high = bounds
        else:
            # Bug fix: formats with more than one dash previously left
            # low/high undefined (NameError) or stale from the prior
            # row; such rows are now skipped explicitly.
            continue
        try:
            average = (int(low) + int(high)) / 2
        except ValueError:
            continue  # non-numeric salary text
        city_salaries.setdefault(city, []).append(average)

    city_avg = {}
    city_count = {}
    for city, values in city_salaries.items():
        # Floor division preserved from the original averaging choice.
        city_avg[city] = sum(values) // len(values)
        city_count[city] = len(values)
    return city_avg, city_count


def visualize_data(cityDict, city_num_dict):
    """Render two pyecharts line charts: average salary per city and
    number of positions per city, written to salary_line.html and
    positions_line.html respectively."""

    def build_line(series_name, data, title, y_name, out_file):
        # One city-indexed line chart with max/min mark points and an
        # average mark line; shared by both charts below.
        chart = Line()
        chart.add_xaxis(list(data.keys()))
        chart.add_yaxis(
            series_name=series_name,
            y_axis=list(data.values()),
            label_opts=opts.LabelOpts(is_show=False),  # hide per-point labels
            markpoint_opts=opts.MarkPointOpts(
                data=[
                    opts.MarkPointItem(type_="max", name="最大值"),
                    opts.MarkPointItem(type_="min", name="最小值"),
                ]
            ),
            markline_opts=opts.MarkLineOpts(
                data=[opts.MarkLineItem(type_="average", name="平均值")]
            ),
        )
        chart.set_global_opts(
            title_opts=opts.TitleOpts(title=title, subtitle="实习岗位"),
            tooltip_opts=opts.TooltipOpts(trigger="axis", axis_pointer_type="cross"),
            xaxis_opts=opts.AxisOpts(
                axislabel_opts=opts.LabelOpts(rotate=45),  # rotate city names 45°
                name="城市",
                name_location="middle",
                name_gap=30,
            ),
            yaxis_opts=opts.AxisOpts(
                name=y_name,
                name_location="middle",
                name_gap=30,
            ),
            toolbox_opts=opts.ToolboxOpts(is_show=True),  # show the toolbox
            legend_opts=opts.LegendOpts(is_show=True),
        )
        chart.render(out_file)

    build_line("工资平均值", cityDict, "不同城市工资平均值", "工资平均值", "salary_line.html")
    build_line("职位数量", city_num_dict, "不同城市职位数量", "职位数量", "positions_line.html")


def visualize_pie(city_num_dict):
    """Render a pyecharts pie chart of each city's share of positions,
    written to positions_pie.html."""
    chart = Pie()
    chart.add(
        series_name="不同城市职位数量占比",
        data_pair=list(city_num_dict.items()),
        label_opts=opts.LabelOpts(formatter="{b}: {d}%"),  # name: percent
    )
    chart.set_global_opts(
        title_opts=opts.TitleOpts(title="不同城市职位数量占比", subtitle="实习岗位"),
        legend_opts=opts.LegendOpts(orient="vertical", pos_left="left", pos_top="20%"),
    )
    chart.render("positions_pie.html")


# Entry point for the visualization half: load the scraped records from
# SQLite, aggregate per-city averages and counts, then render the two
# line charts and the pie chart as standalone HTML files.
if __name__ == "__main__":
    df = fetch_data_from_db()
    cityDict, city_num_dict = process_data(df)
    visualize_data(cityDict, city_num_dict)
    visualize_pie(city_num_dict)

三个图旁边都加一个超链接跳转到代码中的网站

  • 写回答

1条回答 默认 最新

  • 码农阿豪@新空间 新星创作者: 前端开发技术领域 2024-12-31 12:35
    关注
    让阿豪来帮你解答,本回答参考chatgpt3.5编写提供,如果还有疑问可以评论或留言
    要求回答格式规范,有条理,并给出案例和代码。 问题:结合给出的代码,说明如何使用Python爬取数据并存储到SQLite数据库,然后从数据库中获取数据进行处理并可视化展示。 回答:
    1. 爬取数据并存储到SQLite数据库的步骤:
      • 创建一个函数getPositionInfo(detail_url),用于爬取指定页面的信息,并将信息存储到数据库中。
      • 使用BeautifulSoup从网页中提取所需信息如职位名称、学历要求、职位描述和薪资等。
      • 将提取的信息插入到SQLite数据库中的positions表中,使用insert_into_db(job, academic, position, salary)函数进行数据库操作。 示例代码:
    # 示例代码略,根据提供的函数实现数据爬取和存储到数据库的功能
    
    2. 从数据库中获取数据并进行处理的步骤:
      • 创建一个函数fetch_data_from_db(),用于从SQLite数据库中获取数据并返回DataFrame对象。
      • 创建一个函数process_data(df),用于处理获取的数据,例如统计各城市的平均薪资等。 示例代码:
    # 从数据库中获取数据并返回DataFrame对象
    df = fetch_data_from_db()
    # 处理获取的数据,统计各城市的平均薪资
    city_dict = {}
    for index, row in df.iterrows():
        city = row['city']
        salary = row['salary']
        if city in city_dict:
            city_dict[city].append(salary)
        else:
            city_dict[city] = [salary]
            
    # 对数据进行可视化展示,这里以使用pyecharts库绘制柱状图或饼图为例
    from pyecharts.charts import Bar
    bar = (
        Bar()
        .add_xaxis(list(city_dict.keys()))
        .add_yaxis("平均薪资", [sum(salaries) / len(salaries) for salaries in city_dict.values()])
        .set_global_opts(title_opts=opts.TitleOpts(title="各城市实习生平均薪资"))
    )
    bar.render("average_salary_by_city.html")
    

    通过以上步骤,可以完整地实现从数据爬取、存储到数据库,到数据获取、处理和可视化展示的流程。

    评论

报告相同问题?

问题事件

  • 创建了问题 12月31日