import requests
from bs4 import BeautifulSoup
import time
import sqlite3
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36 Edg/128.0.0.0"
}
def getPositionInfo(detail_url):
    # Fetch one job detail page, extract the fields of interest, and store them in SQLite.
    res = requests.get(detail_url, headers=headers)
    if res.status_code == 200:
        html = res.text
        soup = BeautifulSoup(html, "lxml")
        job = soup.find(class_="new_job_name")
        job = job.text.strip() if job else "N/A"
        academic = soup.find(class_="job_academic")
        academic = academic.text if academic else "N/A"
        position = soup.find(class_="job_position")
        position = position.text if position else "N/A"
        salary = soup.find(class_="job_money cutom_font")
        salary = salary.text if salary else "N/A"
        insert_into_db(job, academic, position, salary)
    else:
        print(f"Request failed, status code: {res.status_code}")
def insert_into_db(job, academic, position, salary):
    conn = sqlite3.connect('internship_data.db')
    cursor = conn.cursor()
    # Create the table if it does not exist yet
    cursor.execute('''
        CREATE TABLE IF NOT EXISTS positions (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            job TEXT,
            academic TEXT,
            position TEXT,
            salary TEXT
        )
    ''')
    # Insert one scraped record
    cursor.execute('''
        INSERT INTO positions (job, academic, position, salary)
        VALUES (?, ?, ?, ?)
    ''', (job, academic, position, salary))
    conn.commit()
    cursor.close()
    conn.close()
def main():
    for i in range(1, 16):
        url = f"https://www.shixiseng.com/interns?page={i}&type=intern&keyword=%E7%AE%97%E6%B3%95%E5%AE%9E%E4%B9%A0%E7%94%9F&area=&months=&days=&degree=&official=entry&enterprise=&salary=-0&publishTime=&sortType=&city=%E5%85%A8%E5%9B%BD&internExtend="
        res = requests.get(url, headers=headers)
        if res.status_code == 200:
            html = res.text
            soup = BeautifulSoup(html, "lxml")
            titles = soup.find_all(class_="title ellipsis font")
            for item in titles:
                detail_url = item.attrs["href"]
                getPositionInfo(detail_url)
        else:
            print(f"Request failed, status code: {res.status_code}")
        time.sleep(2)  # throttle requests between result pages
if __name__ == "__main__":
main()
from pyecharts.charts import Line, Pie
import sqlite3
import pandas as pd
from pyecharts import options as opts
def fetch_data_from_db():
    conn = sqlite3.connect('internship_data.db')
    query = """
        SELECT position AS city, salary
        FROM positions
    """
    df = pd.read_sql_query(query, conn)
    conn.close()
    return df
def process_data(df):
    cityDict = {}       # city -> list of per-posting daily averages, later collapsed to one mean
    city_num_dict = {}  # city -> number of postings
    for index, row in df.iterrows():
        city = row['city']
        salary = row['salary']
        # Skip postings whose salary is listed as negotiable ("薪资面议")
        if "薪资面议" in salary:
            continue
        # Salary strings look like "<min>-<max>/天": keep the part before "/" and split the range
        daily = salary.split("/")[0]
        daily_list = daily.split("-")
        if len(daily_list) == 1:
            start = end = daily_list[0]
        elif len(daily_list) == 2:
            start, end = daily_list
        else:
            continue  # unexpected format, skip
        try:
            average = (int(start) + int(end)) / 2
        except ValueError:
            continue
        if city not in cityDict:
            cityDict[city] = []
        cityDict[city].append(average)
    for city, values in cityDict.items():
        average_value = sum(values) // len(values)
        cityDict[city] = average_value
        city_num_dict[city] = len(values)
    return cityDict, city_num_dict
def visualize_data(cityDict, city_num_dict):
    # Line chart of the average salary per city
    line_salary = Line()
    line_salary.add_xaxis(list(cityDict.keys()))
    line_salary.add_yaxis(
        series_name="Average salary",
        y_axis=list(cityDict.values()),
        label_opts=opts.LabelOpts(is_show=False),  # hide per-point labels
        markpoint_opts=opts.MarkPointOpts(
            data=[
                opts.MarkPointItem(type_="max", name="Max"),
                opts.MarkPointItem(type_="min", name="Min")
            ]
        ),
        markline_opts=opts.MarkLineOpts(
            data=[
                opts.MarkLineItem(type_="average", name="Average")
            ]
        )
    )
    line_salary.set_global_opts(
        title_opts=opts.TitleOpts(title="Average salary by city", subtitle="Internship positions"),
        tooltip_opts=opts.TooltipOpts(trigger="axis", axis_pointer_type="cross"),
        xaxis_opts=opts.AxisOpts(
            axislabel_opts=opts.LabelOpts(rotate=45),  # rotate x-axis labels 45 degrees
            name="City",
            name_location="middle",
            name_gap=30
        ),
        yaxis_opts=opts.AxisOpts(
            name="Average salary",
            name_location="middle",
            name_gap=30
        ),
        toolbox_opts=opts.ToolboxOpts(is_show=True),  # show the toolbox
        legend_opts=opts.LegendOpts(is_show=True)
    )
    line_salary.render("salary_line.html")
    # Line chart of the number of positions per city
    line_positions = Line()
    line_positions.add_xaxis(list(city_num_dict.keys()))
    line_positions.add_yaxis(
        series_name="Number of positions",
        y_axis=list(city_num_dict.values()),
        label_opts=opts.LabelOpts(is_show=False),
        markpoint_opts=opts.MarkPointOpts(
            data=[
                opts.MarkPointItem(type_="max", name="Max"),
                opts.MarkPointItem(type_="min", name="Min")
            ]
        ),
        markline_opts=opts.MarkLineOpts(
            data=[
                opts.MarkLineItem(type_="average", name="Average")
            ]
        )
    )
    line_positions.set_global_opts(
        title_opts=opts.TitleOpts(title="Number of positions by city", subtitle="Internship positions"),
        tooltip_opts=opts.TooltipOpts(trigger="axis", axis_pointer_type="cross"),
        xaxis_opts=opts.AxisOpts(
            axislabel_opts=opts.LabelOpts(rotate=45),
            name="City",
            name_location="middle",
            name_gap=30
        ),
        yaxis_opts=opts.AxisOpts(
            name="Number of positions",
            name_location="middle",
            name_gap=30
        ),
        toolbox_opts=opts.ToolboxOpts(is_show=True),
        legend_opts=opts.LegendOpts(is_show=True)
    )
    line_positions.render("positions_line.html")
def visualize_pie(city_num_dict):
    # Pie chart showing each city's share of the total number of positions
    pie = Pie()
    data_pair = [(city, num) for city, num in city_num_dict.items()]
    pie.add(
        series_name="Share of positions by city",
        data_pair=data_pair,
        label_opts=opts.LabelOpts(formatter="{b}: {d}%")
    )
    pie.set_global_opts(
        title_opts=opts.TitleOpts(title="Share of positions by city", subtitle="Internship positions"),
        legend_opts=opts.LegendOpts(orient="vertical", pos_left="left", pos_top="20%")
    )
    pie.render("positions_pie.html")
if __name__ == "__main__":
df = fetch_data_from_db()
cityDict, city_num_dict = process_data(df)
visualize_data(cityDict, city_num_dict)
visualize_pie(city_num_dict)
Request: next to each of the three charts, add a hyperlink that jumps to the website used in the code.
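One way to do this with pyecharts is through the chart title: `opts.TitleOpts` accepts `title_link`/`title_target` (and `subtitle_link`/`subtitle_target`), which make the title and subtitle rendered beside the chart clickable. Below is a minimal sketch for one chart; the same `title_opts` change can be dropped into the three existing `set_global_opts` calls (`line_salary`, `line_positions`, `pie`). The `SITE_URL` value (the shixiseng homepage) and the placeholder data are assumptions for illustration only.

```python
from pyecharts import options as opts
from pyecharts.charts import Line

# Assumed link target: the site the scraper pulls data from.
SITE_URL = "https://www.shixiseng.com"

line_salary = Line()
line_salary.add_xaxis(["Beijing", "Shanghai"])        # placeholder x-axis data
line_salary.add_yaxis("Average salary", [300, 280])   # placeholder y-axis data
line_salary.set_global_opts(
    title_opts=opts.TitleOpts(
        title="Average salary by city",
        title_link=SITE_URL,        # clicking the title opens the site
        title_target="blank",       # open in a new browser tab
        subtitle="Internship positions (data source)",
        subtitle_link=SITE_URL,     # the subtitle links there as well
        subtitle_target="blank",
    )
)
line_salary.render("salary_line.html")
```

If a link element separate from the title is preferred, an ECharts `graphic` text element with a `link` action is another option, but reusing the title via `title_link` is the smallest change to the existing code.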