nicevoice
铁心兰
2021-05-09 21:00
采纳率: 66.7%
浏览 32

下面这段 MongoDB 文档如何用 Python 实现查重?

[{
  "_id": "886132ea6b0947c19faa5bf650d77326",
  "next_run_time": 1620552240,
  "job_state": BinData(0,"gASVyAEAAAAAAAB9lCiMCGV4ZWN1dG9ylIwHZGVmYXVsdJSMB3ZlcnNpb26USwGMEm1pc2ZpcmVfZ3JhY2VfdGltZZRLHowEYXJnc5R9lCiMAnRvlIwLV2FuZ01pbmdMZWmUjANtc2eUjB3nnIvnnIvnnIsgMjAyMS0wNS0wOSAxNzoyNDo1MpR1hZSMDW1heF9pbnN0YW5jZXOUSwOMBmt3YXJnc5R9lIwNbmV4dF9ydW5fdGltZZSMCGRhdGV0aW1llIwIZGF0ZXRpbWWUk5RDCgflBQkRGAAAAACUjARweXR6lIwCX3CUk5QojA1Bc2lhL1NoYW5naGFplE2AcEsAjANDU1SUdJRSlIaUUpSMCGNvYWxlc2NllImMBG5hbWWUjAdzZW5kbXNnlIwHdHJpZ2dlcpSMGWFwc2NoZWR1bGVyLnRyaWdnZXJzLmRhdGWUjAtEYXRlVHJpZ2dlcpSTlCmBlH2UKIwIcnVuX2RhdGWUaBJDCgflBQkRDjQGImeUaBqGlFKUaANLAXVijARmdW5jlIwQX19tYWluX186c2VuZG1zZ5SMAmlklIwgODg2MTMyZWE2YjA5NDdjMTlmYWE1YmY2NTBkNzczMjaUdS4=")
},{
  "_id": "b6ab5d47024d4bc990bf7a70dcac6941",
  "next_run_time": 1620552240,
  "job_state": BinData(0,"gASVyAEAAAAAAAB9lCiMCGV4ZWN1dG9ylIwHZGVmYXVsdJSMB3ZlcnNpb26USwGMEm1pc2ZpcmVfZ3JhY2VfdGltZZRLHowEYXJnc5R9lCiMAnRvlIwLV2FuZ01pbmdMZWmUjANtc2eUjB3nnIvnnIvnnIsgMjAyMS0wNS0wOSAxNzoyNDo1NpR1hZSMDW1heF9pbnN0YW5jZXOUSwOMBmt3YXJnc5R9lIwNbmV4dF9ydW5fdGltZZSMCGRhdGV0aW1llIwIZGF0ZXRpbWWUk5RDCgflBQkRGAAAAACUjARweXR6lIwCX3CUk5QojA1Bc2lhL1NoYW5naGFplE2AcEsAjANDU1SUdJRSlIaUUpSMCGNvYWxlc2NllImMBG5hbWWUjAdzZW5kbXNnlIwHdHJpZ2dlcpSMGWFwc2NoZWR1bGVyLnRyaWdnZXJzLmRhdGWUjAtEYXRlVHJpZ2dlcpSTlCmBlH2UKIwIcnVuX2RhdGWUaBJDCgflBQkRDjgAMmeUaBqGlFKUaANLAXVijARmdW5jlIwQX19tYWluX186c2VuZG1zZ5SMAmlklIwgYjZhYjVkNDcwMjRkNGJjOTkwYmY3YTcwZGNhYzY5NDGUdS4=")
},{
  "_id": "4df49cc1bcd442509e0bcb5da92c114f",
  "next_run_time": 1620552240,
  "job_state": BinData(0,"gASVyAEAAAAAAAB9lCiMCGV4ZWN1dG9ylIwHZGVmYXVsdJSMB3ZlcnNpb26USwGMEm1pc2ZpcmVfZ3JhY2VfdGltZZRLHowEYXJnc5R9lCiMAnRvlIwLV2FuZ01pbmdMZWmUjANtc2eUjB3nnIvnnIvnnIsgMjAyMS0wNS0wOSAxNzoyNDo1N5R1hZSMDW1heF9pbnN0YW5jZXOUSwOMBmt3YXJnc5R9lIwNbmV4dF9ydW5fdGltZZSMCGRhdGV0aW1llIwIZGF0ZXRpbWWUk5RDCgflBQkRGAAAAACUjARweXR6lIwCX3CUk5QojA1Bc2lhL1NoYW5naGFplE2AcEsAjANDU1SUdJRSlIaUUpSMCGNvYWxlc2NllImMBG5hbWWUjAdzZW5kbXNnlIwHdHJpZ2dlcpSMGWFwc2NoZWR1bGVyLnRyaWdnZXJzLmRhdGWUjAtEYXRlVHJpZ2dlcpSTlCmBlH2UKIwIcnVuX2RhdGWUaBJDCgflBQkRDjkDYZKUaBqGlFKUaANLAXVijARmdW5jlIwQX19tYWluX186c2VuZG1zZ5SMAmlklIwgNGRmNDljYzFiY2Q0NDI1MDllMGJjYjVkYTkyYzExNGaUdS4=")
},{
  "_id": "f8d2e3afc18d4543a9738ee90ca3e8ef",
  "next_run_time": 1620552240,
  "job_state": BinData(0,"gASVyAEAAAAAAAB9lCiMCGV4ZWN1dG9ylIwHZGVmYXVsdJSMB3ZlcnNpb26USwGMEm1pc2ZpcmVfZ3JhY2VfdGltZZRLHowEYXJnc5R9lCiMAnRvlIwLV2FuZ01pbmdMZWmUjANtc2eUjB3nnIvnnIvnnIsgMjAyMS0wNS0wOSAxNzoyNDo1OJR1hZSMDW1heF9pbnN0YW5jZXOUSwOMBmt3YXJnc5R9lIwNbmV4dF9ydW5fdGltZZSMCGRhdGV0aW1llIwIZGF0ZXRpbWWUk5RDCgflBQkRGAAAAACUjARweXR6lIwCX3CUk5QojA1Bc2lhL1NoYW5naGFplE2AcEsAjANDU1SUdJRSlIaUUpSMCGNvYWxlc2NllImMBG5hbWWUjAdzZW5kbXNnlIwHdHJpZ2dlcpSMGWFwc2NoZWR1bGVyLnRyaWdnZXJzLmRhdGWUjAtEYXRlVHJpZ2dlcpSTlCmBlH2UKIwIcnVuX2RhdGWUaBJDCgflBQkRDjoGRj6UaBqGlFKUaANLAXVijARmdW5jlIwQX19tYWluX186c2VuZG1zZ5SMAmlklIwgZjhkMmUzYWZjMThkNDU0M2E5NzM4ZWU5MGNhM2U4ZWaUdS4=")
},{
  "_id": "ae164506a1e34157950b58bbe27c1b67",
  "next_run_time": 1620552240,
  "job_state": BinData(0,"gASVyAEAAAAAAAB9lCiMCGV4ZWN1dG9ylIwHZGVmYXVsdJSMB3ZlcnNpb26USwGMEm1pc2ZpcmVfZ3JhY2VfdGltZZRLHowEYXJnc5R9lCiMAnRvlIwLV2FuZ01pbmdMZWmUjANtc2eUjB3nnIvnnIvnnIsgMjAyMS0wNS0wOSAxNzoyNDo1OZR1hZSMDW1heF9pbnN0YW5jZXOUSwOMBmt3YXJnc5R9lIwNbmV4dF9ydW5fdGltZZSMCGRhdGV0aW1llIwIZGF0ZXRpbWWUk5RDCgflBQkRGAAAAACUjARweXR6lIwCX3CUk5QojA1Bc2lhL1NoYW5naGFplE2AcEsAjANDU1SUdJRSlIaUUpSMCGNvYWxlc2NllImMBG5hbWWUjAdzZW5kbXNnlIwHdHJpZ2dlcpSMGWFwc2NoZWR1bGVyLnRyaWdnZXJzLmRhdGWUjAtEYXRlVHJpZ2dlcpSTlCmBlH2UKIwIcnVuX2RhdGWUaBJDCgflBQkRDjsJ1byUaBqGlFKUaANLAXVijARmdW5jlIwQX19tYWluX186c2VuZG1zZ5SMAmlklIwgYWUxNjQ1MDZhMWUzNDE1Nzk1MGI1OGJiZTI3YzFiNjeUdS4=")
},{
  "_id": "0cd92a166f07456c840462a4da5c9df3",
  "next_run_time": 1620552240,
  "job_state": BinData(0,"gASVyAEAAAAAAAB9lCiMCGV4ZWN1dG9ylIwHZGVmYXVsdJSMB3ZlcnNpb26USwGMEm1pc2ZpcmVfZ3JhY2VfdGltZZRLHowEYXJnc5R9lCiMAnRvlIwLV2FuZ01pbmdMZWmUjANtc2eUjB3nnIvnnIvnnIsgMjAyMS0wNS0wOSAxNzoyNDo1NJR1hZSMDW1heF9pbnN0YW5jZXOUSwOMBmt3YXJnc5R9lIwNbmV4dF9ydW5fdGltZZSMCGRhdGV0aW1llIwIZGF0ZXRpbWWUk5RDCgflBQkRGAAAAACUjARweXR6lIwCX3CUk5QojA1Bc2lhL1NoYW5naGFplE2AcEsAjANDU1SUdJRSlIaUUpSMCGNvYWxlc2NllImMBG5hbWWUjAdzZW5kbXNnlIwHdHJpZ2dlcpSMGWFwc2NoZWR1bGVyLnRyaWdnZXJzLmRhdGWUjAtEYXRlVHJpZ2dlcpSTlCmBlH2UKIwIcnVuX2RhdGWUaBJDCgflBQkRDjYG0SyUaBqGlFKUaANLAXVijARmdW5jlIwQX19tYWluX186c2VuZG1zZ5SMAmlklIwgMGNkOTJhMTY2ZjA3NDU2Yzg0MDQ2MmE0ZGE1YzlkZjOUdS4=")
}]

job_state 字段有很多重复值,所以想删除冗余文档、每组只保留一条;但因为该字段是 BinData 格式,不知道该怎么处理。求大神指点,如何用 Python 实现?

  • 点赞
  • 写回答
  • 关注问题
  • 收藏
  • 邀请回答

6条回答 默认 最新

  • technologist_32
    CSDN专家-Time 2021-05-09 21:02
    已采纳

    是要对 job_state 字段进行查重,对吗?

    # Suggested approach: list the distinct values of the job_state field.
    # NOTE(review): `data` is not defined in this snippet -- presumably a
    # pandas DataFrame (or similar) already loaded from the collection; confirm.
    import numpy as np
    # The result is not assigned to anything here; capture the return value
    # if the unique values are needed afterwards.
    np.unique(data.job_state)

    点赞 评论
  • technologist_32
    CSDN专家-Time 2021-05-09 21:09

    实际上应该这样写:直接用 MongoDB 来做去重。

    点赞 评论
  • nicevoice
    铁心兰 2021-05-09 21:05

    是的

    点赞 评论
  • nicevoice
    铁心兰 2021-05-10 07:48
    // Remove duplicate documents from the `jobs` collection, keeping one
    // document per distinct `job_state` value. Written for the mongo shell,
    // which supplies the `db` global.
    //
    // Documents are grouped by `job_state`; for every group with more than one
    // member, all `_id`s except the first collected one are queued for removal.
    // Queued removals are executed in batches of 1000 operations.
    var bulk = db.jobs.initializeOrderedBulkOp(),
        count = 0;
    db.jobs.aggregate([
        { "$group": {
            // Must be the field path "$job_state". A string that does not start
            // with "$" (such as "new $job_state") is a constant literal, which
            // would put every document into a single group and cause all but
            // one document in the collection to be removed. BinData values can
            // be used as a group key directly; no conversion is needed.
            "_id": { "job_state": "$job_state" },
            "ids": { "$push": "$_id" },
            "count": { "$sum": 1 }
        }},
        // Only groups that actually contain duplicates.
        { "$match": { "count": { "$gt": 1 } } }
    ], { "allowDiskUse": true }).forEach(function(doc) {
        // Drop the first _id from the list so that document is kept.
        doc.ids.shift();
        bulk.find({ "_id": { "$in": doc.ids } }).remove();
        count++;

        // Flush every 1000 queued operations and start a fresh bulk builder.
        if ( count % 1000 === 0 ) {
           bulk.execute();
           bulk = db.jobs.initializeOrderedBulkOp();
        }
    });

    // Flush whatever is still queued. Skipped when count is a multiple of 1000
    // (including 0), because then the current bulk builder has no operations.
    if ( count % 1000 !== 0 ) bulk.execute();

    问题已经解决,分享下帮大家快速出坑

    点赞 评论
  • QA_Assistant
    有问必答小助手 2021-05-10 10:49

    您好,我是有问必答小助手,您的问题已经有小伙伴解答了,您看下是否解决,可以追评进行沟通哦~

    如果有您比较满意的答案 / 帮您提供解决思路的答案,可以点击【采纳】按钮,给回答的小伙伴一些鼓励哦~~

    ps:问答VIP仅需29元,即可享受5次/月 有问必答服务,了解详情>>>https://vip.csdn.net/askvip?utm_source=1146287632

    点赞 评论
  • QA_Assistant
    有问必答小助手 2021-05-11 15:38

    非常感谢您使用有问必答服务,为了后续更快速的帮您解决问题,现诚邀您参与有问必答体验反馈。您的建议将会运用到我们的产品优化中,希望能得到您的支持与协助!

    速戳参与调研>>>https://t.csdnimg.cn/Kf0y

    点赞 评论

相关推荐