Python has run the last line of my script, but it never returns to the shell: the process is still visible in the background and its memory usage stays high. What could be causing this?
The code is below.
# crawler.py
import datetime
import gc
import sys
from decimal import Decimal

# add the project root to the module search path
sys.path.append(".")

from config import DB_108_analyze_support_data
from memory_profiler import profile
from sqlalchemy.orm import sessionmaker

from table._session import get_engine
from table.analyze_support_data.tb_boc_interest_rate import TbBocInterestRate
from util.dom_reader import DomReader


def main():
    session = None
    dom_reader = None
    engine = None
    Session = None
    try:
        url = "https://www.bochk.com/whk/rates/hkDollarPrimeRate/hkDollarPrimeRate-enquiry.action?lang=hk"
        dom_reader = DomReader.from_url(url)
        xpath = "//tr[@class='best-rate']/td[2]/text()"
        interest_rate_str = dom_reader.dom.xpath(xpath)[0].strip()
        interest_rate = Decimal(interest_rate_str.replace("%", ""))

        engine = get_engine(db_config_dict=DB_108_analyze_support_data)
        Session = sessionmaker(bind=engine)
        session = Session()
        rate_entry = TbBocInterestRate(
            interest_rate=interest_rate,
            currency_type="HKD",
            create_time=datetime.datetime.now(),
            update_time=datetime.datetime.now(),
        )
        session.add(rate_entry)
        session.commit()
        print("saved HKD prime rate:", interest_rate)
    except Exception as e:
        print(f"error: {e}")
    finally:
        if session is not None:
            session.close()
        if dom_reader is not None:
            del dom_reader
        if engine is not None:
            del engine, Session
        gc.collect()
        print("done")


if __name__ == "__main__":
    main()
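
In case it helps narrow this down, below is a small diagnostic sketch (my own addition, not part of the script above) that could be called at the very end of main(). A CPython process only exits once every non-daemon thread has finished, so listing the threads that are still alive right before "done" would show whether some library left one running; it also disposes the SQLAlchemy connection pool explicitly instead of just del'ing the engine. threading.enumerate() and Engine.dispose() are standard APIs, the helper name is hypothetical.

# diagnostics sketch (hypothetical helper, called at the end of main() in crawler.py)
import threading

def dump_exit_blockers(engine=None):
    # the interpreter cannot exit while any non-daemon thread is alive,
    # so print everything that is still running after our own work is done
    for t in threading.enumerate():
        print(f"alive thread: name={t.name!r} daemon={t.daemon} ident={t.ident}")
    if engine is not None:
        # close all pooled DB connections held by SQLAlchemy
        engine.dispose()
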
# dom_reader.py
import re
import string
from typing import Any, Generic, TypeVar

from curl_cffi import requests
from fake_useragent import UserAgent
from lxml.etree import HTML, _Element, _ElementUnicodeResult
from retrying import retry

from util.common import generate_str
from util.decorator import UnprocessException
from util.proxy import Proxy

T = TypeVar("T")


class PageNotFoundException(Exception):
    pass


class DomReader:
    proxy = Proxy()

    def __init__(self, html: str):
        self.html = html
        self.dom = self.get_dom(html)

    def str_first(self, xpath: str, type_: type[T] = str) -> T:
        data_list: list[_ElementUnicodeResult] = self.dom.xpath(xpath)
        data = None
        if isinstance(data_list, list):
            assert len(data_list) < 2, data_list
            for x in data_list:
                assert isinstance(x, _ElementUnicodeResult) or isinstance(x, str)
            data = data_list[0] if data_list else None
        elif isinstance(data_list, _ElementUnicodeResult) or isinstance(data_list, str):
            data = data_list
        else:
            raise Exception("unknown data type", data_list)
        return data and type_(data)

    def str_many_filter(
        self, xpath: str, need_first: bool = False, nullable: bool = False, null_value: Any = None
    ) -> str:
        data_list_e: list[_ElementUnicodeResult] = self.dom.xpath(xpath)
        data_list: list[str] = list(map(str, data_list_e))
        data_list: list[str] = list(filter(lambda x: x.replace(" ", "").replace("\n", ""), data_list))
        if len(data_list) == 0:
            if nullable:
                return None
            else:
                raise Exception(f"data is empty by {xpath=}")
        if not need_first:
            data_list = data_list[1:]
            assert len(data_list) > 0
        # res = ""
        # for x in data_list:
        #     if x.
        res = "\n".join(data_list)
        res = re.sub(r"\n[\s\n]*", r"\n", res)
        return res

    @classmethod
    def from_url(cls, url: str):
        html = cls.get_html(url)
        return cls.from_html(html)

    @classmethod
    def from_html(cls, html: str):
        return cls(html)

    @staticmethod
    def get_dom(html) -> _Element:
        dom = HTML(html)
        return dom

    @classmethod
    # @retry(
    #     stop_max_attempt_number=10,
    #     retry_on_exception=lambda exc: not isinstance(exc, UnprocessException),
    # )
    def get_html(cls, url: str) -> str:
        headers = {
            "User-Agent": UserAgent(browsers=["chrome"], os=["windows"]).random,
            # "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36",
        }
        cookies = {
            # "session-id": "136-6455282-5340211",
            "session-id": cls.get_session_id(),
        }
        proxy = cls.proxy.url()
        resp_data = requests.get(url, headers=headers, cookies=cookies, proxy=proxy)
        # resp_data = requests.get(self.url, headers=headers, cookies=cookies)
        text = resp_data.text
        text = text.lstrip("\n")  # strip leading newlines, e.g. '\n\n\n\n\n<!DOCTYPE html PUBLIC'
        # print(resp_data.url)
        if resp_data.url.endswith("errors/404.html"):
            return text
            # raise UnprocessException(f"page not found, get 404, {url=}")
        # print(text[:10])
        if not (text.startswith("<!doctype") or text.startswith("<!DOCTYPE")):
            raise Exception(text)
        return text

    @staticmethod
    def get_session_id():
        return (
            generate_str(3, 3, string.digits)
            + "-"
            + generate_str(7, 7, string.digits)
            + "-"
            + generate_str(7, 7, string.digits)
        )
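
Since the process is still around after it prints "done", another way to see what it is blocked on is to dump the stack of every thread from the outside. The sketch below is my own addition under the assumption of a Unix host; it would need to go near the top of crawler.py before the hang. It uses the stdlib faulthandler module, so sending SIGUSR1 to the stuck PID prints all thread tracebacks to stderr.

# optional debug hook for crawler.py (assumes a Unix host; faulthandler is stdlib)
import faulthandler
import signal

# after this line, `kill -USR1 <pid>` makes the process print the current
# traceback of every thread to stderr, showing exactly where it is waiting
faulthandler.register(signal.SIGUSR1, all_threads=True)
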