qq_22839957 2021-04-20 17:52 采纳率: 50%
浏览 37

Python 文本分析:目前结果是用 sorted 按降序排列的,我想得到一个乱序排列的折线图,应该怎么改成乱序排列呢?

import jieba.posseg as pseg
import operator
import warnings
import os
from tqdm import tqdm
import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
# Silence every warning for the whole process, including those raised by
# the imported third-party libraries.
warnings.filterwarnings('ignore')


class Word():
    """Occurrence counters for a single token (or a '|'-joined phrase key).

    ``freq`` counts occurrences and ``deg`` accumulates the lengths of the
    phrases the token was seen in; the score is their ratio.
    """

    def __init__(self, char, freq=0, deg=0):
        """Store the token text and its initial counters (both default to 0)."""
        self.freq = freq
        self.deg = deg
        self.char = char

    def returnScore(self):
        """Return ``deg / freq``, or 0.0 when nothing has been counted yet."""
        # A freshly created Word has freq == 0; dividing would raise
        # ZeroDivisionError, so an uncounted token simply scores zero.
        if not self.freq:
            return 0.0
        return self.deg / self.freq

    def updateOccur(self, phraseLength):
        """Record one occurrence inside a phrase of ``phraseLength`` tokens."""
        self.freq += 1
        self.deg += phraseLength

    def getChar(self):
        """Return the token text."""
        return self.char

    def updateFreq(self):
        """Increase the occurrence count by one without touching ``deg``."""
        self.freq += 1

    def getFreq(self):
        """Return the occurrence count."""
        return self.freq


def notNumStr(instr):
    """Tell whether ``instr`` is free of ASCII letters and digit characters.

    Returns False as soon as any element is an uppercase or lowercase ASCII
    letter or satisfies ``str.isdigit()``; returns True otherwise, which
    includes the empty input.
    """
    def _is_letter_or_digit(ch):
        # '\u0041'-'\u005a' is A-Z and '\u0061'-'\u007a' is a-z.
        if '\u0041' <= ch <= '\u005a':
            return True
        if '\u0061' <= ch <= '\u007a':
            return True
        return ch.isdigit()

    return not any(_is_letter_or_digit(ch) for ch in instr)


def run(rawText):
    """Extract the top-scoring key phrases from ``rawText``.

    The text is segmented with ``pseg.cut``. A token acts as a phrase
    separator when it is in the conjunction list, contains an ASCII letter
    or digit, is in the stop-word list, carries one of the filtered
    part-of-speech flags, or is a newline. Every maximal run of the remaining
    tokens forms a candidate phrase. Each word is scored as ``deg / freq``
    and a phrase scores the sum of its words.

    Args:
        rawText: the text to analyse.

    Returns:
        A list of at most 20 ``(phrase, score)`` tuples ordered by score,
        highest first.
    """
    # ``with`` closes both dictionary files; sets make every membership
    # test below O(1) instead of a scan over the whole list.
    with open('./dataset/1893(utf8).txt', 'r', encoding='utf-8') as swFile:
        swLib = {line.rstrip('\n') for line in swFile}
    with open('./dataset/spw.txt', 'r', encoding='GB2312') as conjFile:
        conjLib = {line.rstrip('\n') for line in conjFile}

    poSPrty = {'m', 'x', 'uj', 'ul', 'mq', 'u', 'v', 'f'}

    listofSingleWord = dict()
    newList = []          # candidate phrases, each a list of words
    tempList = []         # phrase currently being collected
    meaningfulCount = 0   # number of non-separator tokens seen
    for eachWord, flag in pseg.cut(rawText):
        isSeparator = (eachWord in conjLib or not notNumStr(eachWord)
                       or eachWord in swLib or flag in poSPrty
                       or eachWord == '\n')
        if isSeparator:
            # Close the current phrase; consecutive or leading separators
            # must not produce empty phrases.
            if tempList:
                newList.append(tempList)
                tempList = []
            continue
        tempList.append(eachWord)
        meaningfulCount += 1
        if eachWord not in listofSingleWord:
            listofSingleWord[eachWord] = Word(eachWord)
    # Keep the final phrase when the text does not end on a separator.
    if tempList:
        newList.append(tempList)

    for everyPhrase in newList:
        for everyWord in everyPhrase:
            listofSingleWord[everyWord].updateOccur(len(everyPhrase))
        phraseKey = '|'.join(everyPhrase)
        if phraseKey not in listofSingleWord:
            listofSingleWord[phraseKey] = Word(phraseKey)
        # Count every occurrence of the phrase, the first one included.
        # NOTE: a one-word phrase shares its key with the word entry, so that
        # entry's freq is bumped here as well (kept from the original scoring).
        listofSingleWord[phraseKey].updateFreq()

    outputList = dict()
    for everyPhrase in newList:
        # Overly long phrases are not reported.
        if len(everyPhrase) > 5:
            continue
        freq = listofSingleWord['|'.join(everyPhrase)].getFreq()
        # Drop phrases that are rare both relatively and absolutely.
        if meaningfulCount != 0 and freq / meaningfulCount < 0.01 and freq < 3:
            continue
        outputList[''.join(everyPhrase)] = sum(
            listofSingleWord[everyWord].returnScore() for everyWord in everyPhrase)

    sorted_list = sorted(outputList.items(), key=operator.itemgetter(1), reverse=True)
    return sorted_list[:20]


def plot(x, y, title):
    """Draw ``y`` against ``x`` as a line chart and display it.

    Args:
        x: tick labels for the horizontal axis (the words).
        y: values plotted on the vertical axis (the scores).
        title: text shown as the chart title.
    """
    # Presumably chosen so that Chinese labels render; the font must be
    # installed on the machine.
    mpl.rcParams['font.sans-serif'] = ['STZhongsong']
    # Draw the minus sign as an ASCII hyphen instead of the Unicode minus.
    mpl.rcParams['axes.unicode_minus'] = False
    # The line needs a label, otherwise plt.legend() has nothing to show.
    plt.plot(x, y, label='score')
    plt.xticks(rotation=-25)
    plt.xlabel('word')
    plt.ylabel('score')
    plt.title(title)
    plt.legend()
    plt.show()


def get_tags():
    """Extract and plot the key phrases of every file in ``./processed_data/``.

    Each regular file is read as UTF-8, its stripped lines are joined with
    single spaces, the text is passed to ``run`` and the resulting
    (word, score) pairs are handed to ``plot`` with the file name as title.
    """
    path = './processed_data/'
    # os.listdir also returns sub-directories, which cannot be opened as text.
    file_list = [name for name in os.listdir(path)
                 if os.path.isfile(os.path.join(path, name))]

    for file in tqdm(file_list):
        with open(os.path.join(path, file), 'r', encoding='utf-8') as fin:
            # One space after every stripped line, the trailing one included.
            data2p = ''.join(lin.strip() + ' ' for lin in fin)
        res = run(data2p)
        word = [r[0] for r in res]
        score = [r[1] for r in res]
        plot(word, score, file)


# Run the extraction-and-plot pipeline only when executed as a script,
# not when this module is imported.
if __name__ == '__main__':
    get_tags()
 

  • 写回答

2条回答 默认 最新

  • 关注

    可以试试随机抽样:抽样大小和原列表大小一样,例如 random.sample(res, len(res)),得到的就是乱序排列的结果。

    评论

报告相同问题?

悬赏问题

  • ¥15 matlab用simulink求解一个二阶微分方程
  • ¥30 matlab解优化问题代码
  • ¥15 写论文,需要数据支撑
  • ¥15 identifier of an instance of 类 was altered from xx to xx错误
  • ¥100 反编译微信小游戏求指导
  • ¥15 docker模式webrtc-streamer 无法播放公网rtsp
  • ¥15 学不会递归,理解不了汉诺塔参数变化
  • ¥15 基于图神经网络的COVID-19药物筛选研究
  • ¥30 软件自定义无线电该怎样使用
  • ¥15 R语言mediation包做中介分析,直接效应和间接效应都很小,为什么?