De_arning 2024-08-27 17:00 采纳率: 53.8%
浏览 5

Python 脚本运行完最后一行代码后,shell 没有返回(进程没有退出),在后台仍能看到这个进程,而且内存占用很高,可能是什么原因?

Python 脚本运行完最后一行代码后,shell 没有返回(进程没有退出),在后台仍能看到这个进程,而且内存占用很高,可能是什么原因?

img

img

代码如下

# crawler.py
import datetime
import gc
import sys
from decimal import Decimal

# Add the current directory (expected to be the project root) to the import search path
sys.path.append(".")
from config import DB_108_analyze_support_data
from memory_profiler import profile
from sqlalchemy.orm import sessionmaker
from table._session import get_engine
from table.analyze_support_data.tb_boc_interest_rate import TbBocInterestRate
from util.dom_reader import DomReader


def main():
    """Fetch the BOC HK dollar prime rate and store it as a new database row.

    The rate is read from the second cell of the ``best-rate`` table row on
    the BOCHK prime-rate enquiry page, parsed as a ``Decimal`` (the ``%`` sign
    is removed first) and inserted as a ``TbBocInterestRate`` record with
    currency ``"HKD"``.

    Any exception is caught and printed rather than propagated; the session
    and engine are always released and ``done`` is printed last.
    """
    # Bind every name the ``finally`` clause inspects before entering ``try``,
    # so an early failure (e.g. the download raising) cannot turn the cleanup
    # itself into an UnboundLocalError that hides the real error.
    session = None
    engine = None
    try:
        url = "https://www.bochk.com/whk/rates/hkDollarPrimeRate/hkDollarPrimeRate-enquiry.action?lang=hk"
        dom_reader = DomReader.from_url(url)

        xpath = "//tr[@class='best-rate']/td[2]/text()"
        interest_rate_str = dom_reader.dom.xpath(xpath)[0].strip()
        interest_rate = Decimal(interest_rate_str.replace("%", ""))

        engine = get_engine(db_config_dict=DB_108_analyze_support_data)
        session = sessionmaker(bind=engine)()

        # One timestamp for both columns so a new row has identical
        # create/update times.
        now = datetime.datetime.now()
        rate_entry = TbBocInterestRate(
            interest_rate=interest_rate,
            currency_type="HKD",
            create_time=now,
            update_time=now,
        )

        session.add(rate_entry)
        session.commit()

        print("成功保存港元最优惠利率:", interest_rate)
    except Exception as e:
        print(f"发生错误: {e}")
    finally:
        if session is not None:
            session.close()
        if engine is not None:
            # Closing the session only hands its connection back to the pool;
            # dispose() closes the pooled connections themselves.
            # NOTE(review): assumes get_engine returns a SQLAlchemy Engine
            # (it is passed to sessionmaker(bind=...)) -- confirm.
            engine.dispose()

        gc.collect()
        print("done")


# Run the crawler only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
# dom_reader.py
import re
import string
from typing import Generic, TypeVar

from curl_cffi import requests
from fake_useragent import UserAgent
from lxml.etree import HTML, _Element, _ElementUnicodeResult
from retrying import retry
from util.common import generate_str
from util.decorator import UnprocessException
from util.proxy import Proxy

T = TypeVar("T")


class PageNotFoundException(Exception):
    """Exception type for a page that could not be found.

    Adds nothing to ``Exception``; it is not raised anywhere in the code
    visible alongside this definition.
    """


class DomReader:
    """Parsed HTML page with XPath-based text extraction helpers.

    An instance keeps the raw markup in ``html`` and the lxml tree built from
    it in ``dom``.  Use :meth:`from_url` to download and parse a page, or
    :meth:`from_html` when the markup is already at hand.
    """

    # Created once, when the class body executes (i.e. at import time), and
    # shared by every DomReader through the class.
    # NOTE(review): if Proxy holds threads or open connections, this
    # import-time lifetime could keep the process alive at exit -- confirm.
    proxy = Proxy()

    def __init__(self, html: str):
        """Store *html* and parse it into an lxml element tree."""
        self.html = html
        self.dom = self.get_dom(html)

    def str_first(self, xpath: str, type_: T = str) -> T:
        """Return the single text result of *xpath*, converted with *type_*.

        Args:
            xpath: XPath expression expected to yield string results.
            type_: Callable applied to the matched text (``str`` by default).

        Returns:
            ``type_(text)`` for a non-empty match.  A falsy match is returned
            unconverted: ``None`` when nothing matched, or the empty string.

        Raises:
            AssertionError: If the expression yields two or more items, or an
                item that is not a string.
            Exception: If the XPath result is neither a list nor a string.
        """
        result = self.dom.xpath(xpath)

        if isinstance(result, list):
            # Raised explicitly instead of via ``assert`` so the checks still
            # run under ``python -O``; the exception type is unchanged.
            if len(result) >= 2:
                raise AssertionError(result)
            for item in result:
                if not isinstance(item, (_ElementUnicodeResult, str)):
                    raise AssertionError(item)
            data = result[0] if result else None
        elif isinstance(result, (_ElementUnicodeResult, str)):
            data = result
        else:
            raise Exception("unknow data type", result)

        # Short-circuit keeps None / "" as they are instead of converting them.
        return data and type_(data)

    def str_many_filter(
        self, xpath: str, need_first: bool = False, nullable: bool = False, null_value: object = None
    ) -> str:
        """Join the non-blank text results of *xpath* into one string.

        Results that are empty once spaces and newlines are removed are
        discarded before anything else happens.

        Args:
            xpath: XPath expression yielding text results.
            need_first: Keep the first non-blank result.  When false (the
                default) the first non-blank result is dropped.
            nullable: When no non-blank text matched, return *null_value*
                instead of raising.
            null_value: Value returned for an empty match when *nullable* is
                true (``None`` by default).

        Returns:
            The remaining results joined by newlines, with every newline and
            the whitespace following it collapsed to a single newline; or
            *null_value* as described above.

        Raises:
            Exception: If no non-blank text matched and *nullable* is false.
            AssertionError: If *need_first* is false and nothing is left after
                dropping the first result.
        """
        texts = [
            text
            for text in map(str, self.dom.xpath(xpath))
            if text.replace(" ", "").replace("\n", "")
        ]

        if not texts:
            if nullable:
                return null_value
            raise Exception(f"data is empty by {xpath=}")

        if not need_first:
            texts = texts[1:]
            # Explicit raise (not ``assert``) so the check survives ``-O``.
            if not texts:
                raise AssertionError("no text left after dropping the first result")

        return re.sub(r"\n[\s\n]*", r"\n", "\n".join(texts))

    @classmethod
    def from_url(cls, url: str):
        """Download *url* with :meth:`get_html` and build a reader from it."""
        html = cls.get_html(url)
        return cls.from_html(html)

    @classmethod
    def from_html(cls, html: str):
        """Build a reader from markup that has already been fetched."""
        return cls(html)

    @staticmethod
    def get_dom(html) -> _Element:
        """Parse *html* with ``lxml.etree.HTML`` and return the result."""
        dom = HTML(html)
        return dom

    # NOTE: a ``retrying.retry`` wrapper (10 attempts, skipping
    # UnprocessException) used to decorate this method and is currently
    # disabled.
    @classmethod
    def get_html(cls, url: str) -> str:
        """Fetch *url* and return the response body as text.

        The request carries a random Chrome-on-Windows User-Agent, a random
        ``session-id`` cookie and the proxy supplied by the shared ``proxy``
        object.  Leading newlines are stripped from the body.

        Returns:
            The page text.  When the final response URL ends with
            ``errors/404.html`` the 404 page text is returned as-is.

        Raises:
            Exception: If the body does not start with ``<!doctype`` or
                ``<!DOCTYPE``; the whole body is the exception argument.
        """
        headers = {
            "User-Agent": UserAgent(browsers=["chrome"], os=["windows"]).random,
        }
        cookies = {
            "session-id": cls.get_session_id(),
        }
        proxy = cls.proxy.url()

        resp_data = requests.get(url, headers=headers, cookies=cookies, proxy=proxy)

        # Some pages start with blank lines, e.g. '\n\n\n\n\n<!DOCTYPE html PUBLIC'.
        text = resp_data.text.lstrip("\n")

        # A 404 page is handed back to the caller rather than raised.
        if resp_data.url.endswith("errors/404.html"):
            return text

        if not text.startswith(("<!doctype", "<!DOCTYPE")):
            raise Exception(text)
        return text

    @staticmethod
    def get_session_id():
        """Return three ``generate_str`` digit groups joined by ``-``.

        NOTE(review): presumably groups of 3, 7 and 7 digits; generate_str is
        defined elsewhere, so confirm its (min, max, charset) semantics.
        """
        return "-".join(generate_str(size, size, string.digits) for size in (3, 7, 7))
  • 写回答

1条回答 默认 最新

  • 吃不了席 2024-08-27 17:01
    关注

    以下回复参考:皆我百晓生、券券喵儿等免费微信小程序作答:

    根据提供的信息无法生成答案。

    评论

报告相同问题?

问题事件

  • 修改了问题 8月27日
  • 创建了问题 8月27日

悬赏问题

  • ¥15 Coze智能助手搭建过程中的问题请教
  • ¥15 12864只亮屏 不显示汉字
  • ¥20 三极管1000倍放大电路
  • ¥15 vscode报错如何解决
  • ¥15 前端vue CryptoJS Aes CBC加密后端java解密
  • ¥15 python随机森林对两个excel表格读取,shap报错
  • ¥15 基于STM32心率血氧监测(OLED显示)相关代码运行成功后烧录成功OLED显示屏不显示的原因是什么
  • ¥100 X轴为分离变量(因子变量),如何控制X轴每个分类变量的长度。
  • ¥30 求给定范围的全体素数p的(p-2)/p的连乘积值
  • ¥15 VFP如何使用阿里TTS实现文字转语音?