Python实现小说人物关系输出(完整+修正)

Python实现小说人物关系输出(完整+修正),第1张

这篇博客提供能够输出小说人物关系的完整python代码。

代码的原作者是Qingyu Mao,十分感谢!!

以下是Mao的github链接和博客教程链接:
@source: https://github.com/maoqyhz/TextCharactervVisualization
@tutorial: https://www.cnblogs.com/Sinte-Beuve/p/7679392.html

获得小说中人物关系数据和对关系进行可视化的具体理论思路、操作方法、数据准备请参考Qingyu Mao的博客,ta已经讲得非常清楚详细了!

Mao的代码可能存在输出只有表头的空csv的问题,我对代码稍作修改后得到了正常输出。我也添加了两个简单的checkpoint,供有需要的朋友检查代码不能运行的原因。修正后的所有代码如下:

"""
Created on 2017/10/15 19:24
Modified on 2022/05/03 14:06
@author: Qingyu Mao
@source: https://github.com/maoqyhz/TextCharactervVisualization
@tutorial: https://www.cnblogs.com/Sinte-Beuve/p/7679392.html
@co-author: 农魔饼饼
"""

from __future__ import print_function

import codecs
from collections import Counter
from collections import defaultdict

import jieba

# --- Input files -----------------------------------------------------------
TEXT_PATH = './Desktop/text.txt'  # full text of the novel
DICT_PATH = './Desktop/dict.txt'  # dictionary of character names
SYNONYMOUS_DICT_PATH = './Desktop/synonymous_dict.txt'  # alias (synonym) table

# --- Output files ----------------------------------------------------------
SAVE_NODE_PATH = './Desktop/node.csv'
# NOTE(review): unlike the other paths this one is relative to the current
# working directory, not ./Desktop -- confirm that is intended.
SAVE_EDGE_PATH = './edge.csv'

# --- Module-level containers -----------------------------------------------
# person_counter:       appearance count per character, e.g. {'a': 1, 'b': 2}
# person_per_paragraph: characters found in each paragraph, e.g. [['a', 'b'], []]
# relationships:        key is character A, value is a dict of character B -> weight
# synonymous_dict:      presumably alias -> canonical name (never filled here)
# RelationshipView keeps its own per-instance versions of these containers.
person_counter = defaultdict(int)  # appearance counter per character
person_per_paragraph = list()
relationships = dict()
synonymous_dict = dict()

class RelationshipView:
    """Extract character co-occurrence data from a novel and export it as CSV.

    The pipeline (see ``generate``) is:

    1. ``count_person``       -- tokenise every paragraph with jieba and record
       which known character names occur in it.
    2. ``calc_relationship``  -- turn per-paragraph co-occurrence into weighted,
       directed edges between characters.
    3. ``save_node_and_edge`` -- write a node table and an edge table as CSV.

    All input files are read as UTF-8 (a leading BOM is tolerated) and may use
    CRLF, LF or CR line endings.
    """

    def __init__(self, text_path, dict_path, synonymous_dict_path):
        """Remember the input paths and create empty result containers.

        :param text_path: path of the novel's full text; paragraphs are
            separated by a blank line.
        :param dict_path: path of the character-name dictionary, one name per
            line (also handed to ``jieba.load_userdict``).
        :param synonymous_dict_path: path of the alias table, one
            ``alias canonical_name`` pair per line.
        """
        self._text_path = text_path
        self._dict_path = dict_path
        self._synonymous_dict_path = synonymous_dict_path
        # Appearance count per character, e.g. {'a': 1, 'b': 2}.
        self._person_counter = defaultdict(int)
        # Characters found in each paragraph, e.g. [['a', 'b'], []].
        self._person_per_paragraph = []
        # Key is character A; value is a dict mapping character B -> weight.
        self._relationships = {}
        # Alias -> canonical name.
        self._synonymous_dict = {}

    @staticmethod
    def _read_text(path):
        """Return the decoded content of *path* with line endings normalised to LF."""
        # codecs.open works on the raw bytes, so no newline translation happens
        # for us; 'utf-8-sig' drops a BOM if one is present and is otherwise
        # plain UTF-8.
        with codecs.open(path, 'r', 'utf-8-sig') as f:
            text = f.read()
        return text.replace('\r\n', '\n').replace('\r', '\n')

    def generate(self):
        """Run the whole pipeline: count, build relationships, save CSV files."""
        self.count_person()
        self.calc_relationship()
        self.save_node_and_edge()

    def synonymous_names(self):
        """Load the alias table.

        Each usable line holds ``alias canonical_name`` separated by
        whitespace. Lines with fewer than two fields (blank lines, a trailing
        newline) are skipped instead of raising ``IndexError``.

        :return: dict mapping alias -> canonical name.
        """
        for line in self._read_text(self._synonymous_dict_path).split('\n'):
            fields = line.split()
            if len(fields) < 2:
                continue
            self._synonymous_dict[fields[0]] = fields[1]
        return self._synonymous_dict

    def get_clean_paragraphs(self):
        """Split the novel into paragraphs on blank lines.

        :return: list of paragraph strings.
        """
        paragraphs = self._read_text(self._text_path).split('\n\n')
        # Checkpoint: when the split worked this shows the first ten paragraphs.
        print(paragraphs[:10])
        return paragraphs

    def count_person(self):
        """Count character appearances and record the characters of each paragraph.

        A token counts as a character when it is listed in the name
        dictionary; aliases are replaced by their canonical name before
        counting.

        :return: the appearance counter (name -> count).
        """
        paragraphs = self.get_clean_paragraphs()
        synonymous = self.synonymous_names()
        print('start process node')

        # The first whitespace-separated field of each line is the name; this
        # also copes with jieba's "word freq tag" user-dictionary layout.
        name_list = []
        for line in self._read_text(self._dict_path).split('\n'):
            fields = line.split()
            if fields:
                name_list.append(fields[0])
        # Checkpoint: when the dictionary was read this shows the first ten names.
        print(name_list[:10])
        known_names = set(name_list)  # O(1) membership test per token

        # Loading the user dictionary once is enough; it does not depend on
        # the paragraph being processed.
        jieba.load_userdict(self._dict_path)

        for paragraph in paragraphs:
            present = []
            for word in jieba.cut(paragraph):
                if word not in known_names:
                    continue
                # Map an alias onto its canonical name, if it has one.
                word = synonymous.get(word) or word
                present.append(word)
                # First sighting: give the character an (empty) edge table.
                if word not in self._person_counter:
                    self._relationships[word] = {}
                self._person_counter[word] += 1
            self._person_per_paragraph.append(present)
        return self._person_counter

    def calc_relationship(self):
        """Accumulate edge weights from per-paragraph co-occurrence.

        For every ordered pair of distinct names in a paragraph's list the
        weight of ``name1 -> name2`` grows by one; repeated mentions inside a
        paragraph therefore contribute repeatedly.

        :return: dict mapping name -> {other name -> weight}.
        """
        print("start to process edge")
        for names in self._person_per_paragraph:
            for name1 in names:
                edges = self._relationships.setdefault(name1, {})
                for name2 in names:
                    if name1 == name2:
                        continue
                    edges[name2] = edges.get(name2, 0) + 1
        return self._relationships

    def save_node_and_edge(self, node_path=None, edge_path=None, min_weight=3):
        """Write the node and edge tables as CSV.

        The column layout (``Id,Label,Weight`` / ``Source,Target,Weight``) is
        presumably the one Gephi imports; the original comment said "dephi".
        Existing files are overwritten, so running the script again no longer
        appends a second header and duplicate rows.

        :param node_path: node CSV path; defaults to ``SAVE_NODE_PATH``.
        :param edge_path: edge CSV path; defaults to ``SAVE_EDGE_PATH``.
        :param min_weight: only edges whose weight is strictly greater than
            this value are written.
        """
        if node_path is None:
            node_path = SAVE_NODE_PATH
        if edge_path is None:
            edge_path = SAVE_EDGE_PATH

        with codecs.open(node_path, "w", "utf-8") as f:
            f.write("Id,Label,Weight\r\n")
            for name, times in self._person_counter.items():
                f.write(name + "," + name + "," + str(times) + "\r\n")

        with codecs.open(edge_path, "w", "utf-8") as f:
            f.write("Source,Target,Weight\r\n")
            for name, edges in self._relationships.items():
                for target, weight in edges.items():
                    if weight > min_weight:
                        f.write(name + "," + target + "," + str(weight) + "\r\n")
        print('save file successful!')


if __name__ == '__main__':
    # Script entry point: build the view from the configured paths and run
    # the full count -> relationships -> CSV export pipeline.
    RelationshipView(
        text_path=TEXT_PATH,
        dict_path=DICT_PATH,
        synonymous_dict_path=SYNONYMOUS_DICT_PATH,
    ).generate()

欢迎分享,转载请注明来源:内存溢出

原文地址: http://outofmemory.cn/langs/867217.html

(0)
打赏 微信扫一扫 微信扫一扫 支付宝扫一扫 支付宝扫一扫
上一篇 2022-05-12
下一篇 2022-05-12

发表评论

登录后才能评论

评论列表(0条)

保存