# bio.py — BIO annotation pipeline for construction-industry patent abstracts.
import ast
import re
from collections import defaultdict
from datetime import datetime
from pathlib import Path

import jieba
import jieba.posseg as pseg
import pandas as pd
import polars as pl
import yaml

import bio
from data_preparation import demo_file_name, columns, read_ner_result
from deepseek import ds_ner
from scel2text import get_words_from_sogou_cell_dict
# Sogou .scel input-method cell dictionary of construction vocabulary.
building_dict_file_name_sogo = '~/Documents/论文/建筑词汇大全【官方推荐】.scel'
# Output CSV holding BIO-annotated patent-abstract samples.
bio_file_name = '~/Documents/论文/上市公司-建筑工业化-专利摘要数据-样本-BIO.csv'
# Entity category name -> BIO tag suffix (B-XXX / I-XXX).
# NOTE(review): both "材料类" and "材料" map to MAT — presumably the NER model
# returns either spelling; confirm against the model output.
entity_types = {
    "结构部件类": 'COM',
    "材料类": 'MAT',
    "材料": 'MAT',
    "技术参数类": 'PAR',
    "技术特征类": 'FEA',
    "制造工艺类": 'MAC',
    "功能属性类": 'FUN',
    "规范标准类": 'REG',
    "专利法律实体类": 'PAT',
    "性能指标类": 'PER',
    "特殊构造类": 'SPE',
}
# Only these categories are kept when annotating BIO labels and building the dictionary.
categs = ['结构部件类', '材料类', '技术特征类', '制造工艺类', '特殊构造类']
  30. class FlowList(list):
  31. ...
# Custom PyYAML representer: emit the sequence in flow style, e.g. [a, b, c].
def custom_hobbies_representer(dumper, data):
    """Represent `data` as a standard YAML sequence using inline flow style."""
    return dumper.represent_sequence('tag:yaml.org,2002:seq', data, flow_style=True)
# Register the flow-style representer so FlowList instances dump inline.
yaml.add_representer(FlowList, custom_hobbies_representer)
# Example patent application number: "CN200910091292.3"
all_words = defaultdict(list)  # NOTE(review): appears unused in this chunk — verify before removing
  39. def process(row):
  40. jieba.load_userdict("word_dict.txt")
  41. bio = yaml.safe_load(open('bio.yml'))
  42. description = row[3]
  43. words = pseg.cut(description)
  44. origin_words = []
  45. none_words = []
  46. bio_words = []
  47. for value in bio.values():
  48. if value:
  49. bio_words += value
  50. for word, flag in words:
  51. origin_words.append(word)
  52. if word not in bio_words:
  53. none_words.append(word)
  54. return origin_words, none_words
  55. def ner_demo_by_deepseek():
  56. patents = []
  57. df = pl.read_csv(str(Path(demo_file_name).expanduser()), columns=columns, encoding="utf-8")
  58. df = df.sort('专利申请号')
  59. results = []
  60. count = 1
  61. for row in df.iter_rows():
  62. try:
  63. result = ds_ner(row[1], row[3])
  64. # 去除deepseek返回的yaml格式
  65. result = result.replace('```yaml\n', '').replace('```', '')
  66. result = yaml.safe_load(result)
  67. for t in result['结果']:
  68. result['结果'][t] = FlowList(result['结果'][t])
  69. results.append(result)
  70. patents.append(row[1])
  71. Path('ds_patents_result.yml').write_text(yaml.dump(results, allow_unicode=True), encoding='utf-8')
  72. Path('ds_patents.yml').write_text(yaml.dump(patents), encoding='utf-8')
  73. print(datetime.now(), row[1], count)
  74. count += 1
  75. except Exception as e:
  76. continue
  77. print('All ok')
  78. def resave_ds_patents_result():
  79. ner_result = read_ner_result()
  80. for ns in ner_result:
  81. for t in ns['结果']:
  82. ns['结果'][t] = FlowList(ns['结果'][t])
  83. result_str = yaml.dump(ner_result, allow_unicode=True)
  84. Path('ds_patents_result_resave.yml').write_text(result_str, encoding='utf-8')
  85. def annotate_bio(text, ner_words):
  86. '''
  87. 1. 结构部件类
  88. 2. 材料类
  89. 3. 技术参数类
  90. 4. 技术特征类
  91. 5. 制造工艺类
  92. 6. 功能属性类
  93. 7. 规范标准类
  94. 8. 专利法律实体类
  95. 9. 性能指标类
  96. 10. 特殊构造类
  97. '''
  98. tokens = list(text) # 将文本按字拆分
  99. labels = ["O"] * len(tokens)
  100. not_found = []
  101. # 标注
  102. for entity_name, entity_word in ner_words.items():
  103. if entity_name not in categs:
  104. continue
  105. entity_type = entity_types[entity_name]
  106. for entity in entity_word:
  107. entity = re.escape(entity)
  108. matchs = list(re.finditer(entity, text))
  109. if not matchs:
  110. not_found.append(entity)
  111. for match in matchs:
  112. start, end = match.start(), match.end()
  113. labels[start] = f"B-{entity_type}"
  114. for i in range(start + 1, end):
  115. labels[i] = f"I-{entity_type}"
  116. return list(zip(tokens, labels)), not_found
  117. def add_bio():
  118. ner_results = read_ner_result()
  119. for ner_word in ner_results:
  120. bio_result, not_found = annotate_bio(ner_word['摘要'], ner_word['结果'])
  121. ner_word['bio'] = str(bio_result)
  122. ner_word['ner'] = ner_word.pop('结果')
  123. if not_found:
  124. ner_word['not_found'] = FlowList(not_found)
  125. print(ner_word['专利号'], not_found)
  126. bio_file = str(Path(bio_file_name).expanduser())
  127. # for ns in ner_results:
  128. # for t in ns['ner']:
  129. # ns['ner'][t] = FlowList(ns['ner'][t])
  130. # result_str = yaml.dump(ner_results, allow_unicode=True)
  131. # Path(bio_file_name).expanduser().write_text(result_str, encoding='utf-8')
  132. df = pd.DataFrame.from_dict(ner_results) # 转换为 DataFrame
  133. df.to_csv(bio_file, index=False, encoding="utf-8")
  134. print('All done.')
  135. def get_bio():
  136. df = pd.read_csv(bio_file_name, encoding='utf-8')
  137. labels = []
  138. for label in df['bio']:
  139. label = eval(label)
  140. labels.append([l[1] for l in label])
  141. return df['摘要'].tolist(), labels
  142. def check_entity_name():
  143. ner_result = read_ner_result()
  144. err_patent = {}
  145. for ner_word in ner_result:
  146. text = ner_word['摘要']
  147. patent = ner_word['专利号']
  148. non_include = []
  149. for lables in ner_word['结果'].values():
  150. for label in lables:
  151. if label not in text:
  152. non_include.append(label)
  153. if non_include:
  154. err_patent[patent] = non_include
  155. Path('non_include.yml').write_text(yaml.dump(err_patent, allow_unicode=True), encoding='utf-8')
  156. def building_dict():
  157. # 搜狗建筑词汇库
  158. records = get_words_from_sogou_cell_dict(Path(building_dict_file_name_sogo).expanduser())
  159. build_words = [r[1] for r in records]
  160. ner_result = read_ner_result()
  161. words = []
  162. for data in ner_result:
  163. for entity_name, entity_words in data['结果'].items():
  164. if entity_name not in categs:
  165. continue
  166. words.extend(entity_words)
  167. return set(build_words) | set(words)
if __name__ == '__main__':
    # Pipeline steps — run one at a time by (un)commenting:
    # building_dict()
    # ner_demo_by_deepseek()
    # resave_ds_patents_result()
    # add_bio()
    # check_entity_name()
    building_dict()  # NOTE(review): return value is discarded here — confirm intent