# gaojun / gxy_ner
from spacy_pkuseg import pkuseg
from pathlib import Path

from gensim import corpora, models
from gensim.models.coherencemodel import CoherenceModel

import data_preparation
import bio
import time
import scipy
import numpy as np
import jieba

# Relative path of the user-dictionary file: written by save_user_dict() and,
# after being made absolute, passed to pkuseg as ``user_dict``.
building_dict_file_name = 'lda/building_dict.txt'

def save_user_dict():
    """Write the building-term dictionary to ``building_dict_file_name``.

    The entries returned by ``bio.building_dict()`` are joined with newline
    separators and written to the file that is later handed to pkuseg as its
    user dictionary. The parent directory is created when it is missing.
    """
    building_dict = bio.building_dict()
    dict_path = Path(building_dict_file_name)
    # Make sure the target directory exists so a fresh checkout does not fail
    # with FileNotFoundError.
    dict_path.parent.mkdir(parents=True, exist_ok=True)
    # Without an explicit encoding, write_text() uses the locale's preferred
    # encoding, which differs between platforms; pin UTF-8 so the file content
    # is the same everywhere.
    # NOTE(review): presumably pkuseg reads user dictionaries as UTF-8 - confirm.
    dict_path.write_text('\n'.join(building_dict), encoding='utf-8')


def cut(text):
    """Segment ``text`` with pkuseg using the building user dictionary.

    A new pkuseg segmenter is constructed on every call, with the absolute
    path of ``building_dict_file_name`` passed as ``user_dict``.

    Returns:
        Whatever the segmenter's ``cut(text)`` call returns.
    """
    user_dict_path = Path(building_dict_file_name).absolute()
    segmenter = pkuseg(user_dict=str(user_dict_path))
    return segmenter.cut(text)


def _preprocess_text_with_ner(patent_ner, stop_words, segmenter=None):
    """Turn one NER record into the token list used as an LDA document.

    Args:
        patent_ner: Mapping whose '结果' entry maps each category to an
            iterable of entity strings. When ``segmenter`` is given, its
            '摘要' entry is also read as the text to segment.
        stop_words: Container of tokens to drop (tested with ``in``).
        segmenter: Optional object exposing ``cut(text)``. When provided,
            words of the abstract are appended after the entities unless they
            are stop words, single characters, or substrings of a kept entity.

    Returns:
        A list of unique tokens, entities first, in first-seen order.
    """
    entities = []
    for category_entities in patent_ner['结果'].values():
        for ent in category_entities:
            # Trim surrounding whitespace and lower-case the entity text.
            clean_ent = ent.strip().lower()
            # Drop empty entities, stop words and single-character entities.
            if clean_ent and clean_ent not in stop_words and len(clean_ent) > 1:
                entities.append(clean_ent)

    tokens = list(entities)

    if segmenter is not None:
        entity_texts = set(entities)
        for word in segmenter.cut(patent_ner['摘要']):
            word = word.strip().lower()
            if not word or len(word) <= 1 or word in stop_words:
                continue
            # Simplified overlap test: skip a word that occurs inside any kept
            # entity. This is a substring check and may over-filter.
            if any(word in ent_text for ent_text in entity_texts):
                continue
            tokens.append(word)

    # De-duplicate while preserving first-seen order.
    return list(dict.fromkeys(tokens))


def process(num_topics=30, use_segmentation=False):
    """Train an LDA topic model on the NER results and print its topics.

    Reads the NER records via ``data_preparation.read_ner_result()``, converts
    each record to a token list, builds a gensim dictionary and bag-of-words
    corpus, trains ``models.LdaModel``, prints the top 10 words of every topic
    and the C_v coherence score, and finally prints the elapsed time.

    Args:
        num_topics: Number of topics to fit. This is a hyper-parameter that
            usually needs tuning (e.g. with perplexity or coherence scores).
            Defaults to 30, the value previously hard-coded.
        use_segmentation: When true, each record's abstract is additionally
            segmented with pkuseg (loaded with the user dictionary at
            ``building_dict_file_name``) and its non-entity words are added to
            the document. Defaults to False, which matches the previous
            behaviour where only entities were used; the segmenter is then
            not loaded at all.

    Returns:
        None. All results are printed to stdout; errors raised during
        training or evaluation are caught and reported with ``print``.
    """
    # NOTE(review): get_stop_words is not defined or imported in the visible
    # part of this file - confirm where it comes from.
    stop_words = get_stop_words()

    # Only pay for loading the pkuseg model when segmentation is requested.
    segmenter = None
    if use_segmentation:
        file_name = str(Path(building_dict_file_name).absolute())
        segmenter = pkuseg(user_dict=file_name)

    ner_result = data_preparation.read_ner_result()
    processed_docs = [
        _preprocess_text_with_ner(patent_ner, stop_words, segmenter)
        for patent_ner in ner_result
    ]

    print("\n开始构建LDA模型...")
    start_time = time.time()

    # Build the gensim dictionary (token <-> id mapping).
    id2word = corpora.Dictionary(processed_docs)
    print(f"创建字典完成，字典大小: {len(id2word)}")

    # Build the corpus in bag-of-words format.
    corpus = [id2word.doc2bow(doc) for doc in processed_docs]
    print("创建语料库 (BoW) 完成。")

    print(f"设置主题数量为: {num_topics}")

    try:
        # passes: number of sweeps over the whole corpus.
        # iterations: maximum inference iterations per document.
        # alpha / eta: priors; 'auto' lets gensim learn them from the data.
        # random_state: fixed seed for reproducible results.
        # NOTE(review): gensim's LdaMulticore does not support alpha='auto',
        # which is presumably why the single-process LdaModel is used - confirm.
        lda_model = models.LdaModel(
            corpus=corpus,
            id2word=id2word,
            num_topics=num_topics,
            random_state=42,
            chunksize=100,  # documents per training chunk
            passes=15,
            iterations=100,
            alpha='auto',
            eta='auto',
        )
        print("LDA模型训练成功。")

        print("\n--- LDA 主题结果 ---")
        # Each item is (topic_id, "weight*word + ..."); show 10 words per topic.
        topics = lda_model.print_topics(num_topics=num_topics, num_words=10)
        for i, topic in enumerate(topics):
            print(f"主题 {i + 1}: {topic[1]}")

        # C_v coherence: higher values usually mean more interpretable topics.
        print("\n计算模型一致性分数 (C_v)...")
        coherence_model_lda = CoherenceModel(
            model=lda_model,
            texts=processed_docs,
            dictionary=id2word,
            coherence='c_v',
        )
        coherence_lda = coherence_model_lda.get_coherence()
        print(f'LDA 模型一致性分数 (C_v): {coherence_lda:.4f}')

        # To inspect one document's topic distribution, index the model with
        # its bag-of-words vector, e.g. lda_model[corpus[0]].

        # The reported duration covers dictionary/corpus construction,
        # training and the coherence evaluation above.
        end_time = time.time()
        print(f"LDA模型训练耗时: {end_time - start_time:.2f} 秒。")
    except Exception as e:
        # Script-level boundary: report the failure instead of crashing.
        print(f"LDA模型训练或评估过程中发生错误: {e}")
        print("可能的原因：数据量过少、文本预处理后内容为空、参数设置问题等。")

if __name__ == '__main__':
    # One-off step (disabled): regenerate the pkuseg user-dictionary file.
    # save_user_dict()
    # Sample patent abstract kept for manually trying cut(); process() does
    # not use it.
    txt = '''
本发明提供一种陶瓷坯体及其成型方法,所述成型方法包括下述步骤：步骤1、配料：称取陶瓷粉料,量取溶剂,并制备环氧体系粉末,以100重量份的陶瓷粉料为基准,所述环氧体系粉末的含量为1-5重量份,所述溶剂的含量为50-100重量份；步骤2、球磨：将上述陶瓷粉料、环氧体系粉末、溶剂进行球磨,得到浆料；步骤3、成型：将上述浆料注入模具中,低温脱除溶剂；然后在模具上方进行加压,同时对模具进行加热,冷却后得到坯件。本发明还涉及采用所述陶瓷坯体制作的陶瓷产品。本发明通过在体系中加入溶剂,能够有效降低粘结剂在整个体系中的含量比,所制作的陶瓷坯件具有较高的硬度、较低的收缩率,并且适用于于大尺寸的陶瓷产品的制备。    
    '''
    # Manual segmentation check (disabled).
    # cut(txt)
    # Run the full LDA pipeline.
    process()