"""NER annotation pipeline for construction-industry patent abstracts.

Steps (run individually from ``__main__``): call DeepSeek for entity
extraction, re-save/normalize the YAML results, convert entities to
character-level BIO tags, sanity-check entity names against abstracts,
and build a domain word dictionary from a Sogou cell dict plus the
extracted entities.
"""
import ast
import re
from collections import defaultdict
from datetime import datetime
from pathlib import Path

import yaml
import jieba
import jieba.posseg as pseg
import polars as pl
import pandas as pd

import bio
from data_preparation import demo_file_name, columns, read_ner_result
from deepseek import ds_ner
from scel2text import get_words_from_sogou_cell_dict

# Sogou cell dictionary of building/construction vocabulary.
building_dict_file_name_sogo = '~/Documents/论文/建筑词汇大全【官方推荐】.scel'
# Output CSV holding the BIO-annotated patent abstracts.
bio_file_name = '~/Documents/论文/上市公司-建筑工业化-专利摘要数据-样本-BIO.csv'

# Chinese entity-category name -> short tag suffix used in BIO labels.
entity_types = {
    "结构部件类": 'COM',     # structural components
    "材料类": 'MAT',         # materials
    "材料": 'MAT',           # alias the model sometimes returns for 材料类
    "技术参数类": 'PAR',     # technical parameters
    "技术特征类": 'FEA',     # technical features
    "制造工艺类": 'MAC',     # manufacturing processes
    "功能属性类": 'FUN',     # functional attributes
    "规范标准类": 'REG',     # standards / regulations
    "专利法律实体类": 'PAT',  # patent legal entities
    "性能指标类": 'PER',     # performance indicators
    "特殊构造类": 'SPE',     # special constructions
}

# Only these categories are kept when annotating BIO / building the dictionary.
categs = ['结构部件类', '材料类', '技术特征类', '制造工艺类', '特殊构造类']


class FlowList(list):
    """Marker list subclass that PyYAML dumps in flow (inline) style."""


def custom_hobbies_representer(dumper, data):
    """Represent a FlowList as an inline YAML sequence, e.g. ``[a, b, c]``."""
    return dumper.represent_sequence('tag:yaml.org,2002:seq', data, flow_style=True)


# Register the flow-style representer for FlowList instances.
yaml.add_representer(FlowList, custom_hobbies_representer)

# Collected words keyed by category (kept for external use / later steps).
all_words = defaultdict(list)


def process(row):
    """Segment one abstract and split its tokens into known/unknown.

    Parameters:
        row: a sequence whose index 3 holds the abstract text
            (presumably a Polars ``iter_rows`` tuple — confirm against caller).

    Returns:
        (origin_words, none_words): every segmented token, and the subset
        not present in any entity list of ``bio.yml``.
    """
    # NOTE(review): reloading the user dict and bio.yml on every call is
    # wasteful but preserved; hoist these if process() becomes hot.
    jieba.load_userdict("word_dict.txt")
    with open('bio.yml') as fh:  # `with` closes the handle; name avoids shadowing the `bio` module
        bio_entities = yaml.safe_load(fh)

    # Flatten all entity lists into a set for O(1) membership tests.
    known_words = set()
    for value in bio_entities.values():
        if value:
            known_words.update(value)

    origin_words = []
    none_words = []
    for word, _flag in pseg.cut(row[3]):
        origin_words.append(word)
        if word not in known_words:
            none_words.append(word)
    return origin_words, none_words


def ner_demo_by_deepseek():
    """Run DeepSeek NER over the demo patent CSV, checkpointing after each row.

    Writes ``ds_patents_result.yml`` (entity results so far) and
    ``ds_patents.yml`` (processed patent numbers) after every successful
    row, so progress survives interruption of the slow API loop.
    """
    patents = []
    results = []
    df = pl.read_csv(str(Path(demo_file_name).expanduser()),
                     columns=columns, encoding="utf-8")
    df = df.sort('专利申请号')
    count = 1
    for row in df.iter_rows():
        try:
            raw = ds_ner(row[1], row[3])
            # Strip the ```yaml fence DeepSeek wraps around its answer.
            raw = raw.replace('```yaml\n', '').replace('```', '')
            result = yaml.safe_load(raw)
            for t in result['结果']:
                result['结果'][t] = FlowList(result['结果'][t])
            results.append(result)
            patents.append(row[1])
            # Checkpoint: rewrite both files after every row.
            Path('ds_patents_result.yml').write_text(
                yaml.dump(results, allow_unicode=True), encoding='utf-8')
            Path('ds_patents.yml').write_text(yaml.dump(patents), encoding='utf-8')
            print(datetime.now(), row[1], count)
            count += 1
        except Exception as e:
            # Previously a silent `continue` — at least report which patent
            # failed and why before moving on (best-effort loop preserved).
            print(datetime.now(), row[1], 'failed:', e)
            continue
    print('All ok')


def resave_ds_patents_result():
    """Re-dump the saved NER results with entity lists in flow style."""
    ner_result = read_ner_result()
    for ns in ner_result:
        for t in ns['结果']:
            ns['结果'][t] = FlowList(ns['结果'][t])
    result_str = yaml.dump(ner_result, allow_unicode=True)
    Path('ds_patents_result_resave.yml').write_text(result_str, encoding='utf-8')


def annotate_bio(text, ner_words):
    """Character-level BIO annotation of ``text`` from NER results.

    Categories (only those in ``categs`` are annotated):
    1. 结构部件类  2. 材料类  3. 技术参数类  4. 技术特征类  5. 制造工艺类
    6. 功能属性类  7. 规范标准类  8. 专利法律实体类  9. 性能指标类  10. 特殊构造类

    Parameters:
        text: the abstract string (annotated per character).
        ner_words: mapping of category name -> list of entity strings.

    Returns:
        (pairs, not_found): ``pairs`` is a list of ``(char, label)``
        tuples; ``not_found`` lists entities absent from ``text``.
    """
    tokens = list(text)            # per-character tokens
    labels = ["O"] * len(tokens)
    not_found = []
    for entity_name, entity_word in ner_words.items():
        if entity_name not in categs:
            continue
        entity_type = entity_types[entity_name]
        for entity in entity_word:
            # Escape regex metacharacters; we want a literal substring search.
            matchs = list(re.finditer(re.escape(entity), text))
            if not matchs:
                # Report the original string, not the escaped pattern.
                not_found.append(entity)
            for match in matchs:
                start, end = match.start(), match.end()
                labels[start] = f"B-{entity_type}"
                for i in range(start + 1, end):
                    labels[i] = f"I-{entity_type}"
    return list(zip(tokens, labels)), not_found


def add_bio():
    """Attach BIO annotations to every NER result and save as CSV."""
    ner_results = read_ner_result()
    for ner_word in ner_results:
        bio_result, not_found = annotate_bio(ner_word['摘要'], ner_word['结果'])
        ner_word['bio'] = str(bio_result)          # stored as repr; parsed back in get_bio()
        ner_word['ner'] = ner_word.pop('结果')      # rename key 结果 -> ner
        if not_found:
            ner_word['not_found'] = FlowList(not_found)
            print(ner_word['专利号'], not_found)
    bio_file = str(Path(bio_file_name).expanduser())
    df = pd.DataFrame.from_dict(ner_results)
    df.to_csv(bio_file, index=False, encoding="utf-8")
    print('All done.')


def get_bio():
    """Load the BIO CSV and return (abstract texts, per-text label lists)."""
    df = pd.read_csv(bio_file_name, encoding='utf-8')
    labels = []
    for cell in df['bio']:
        # The column holds repr() of a list of (char, label) tuples;
        # literal_eval parses it safely (no arbitrary code execution).
        pairs = ast.literal_eval(cell)
        labels.append([tag for _ch, tag in pairs])
    return df['摘要'].tolist(), labels


def check_entity_name():
    """Find entities the model returned that do not occur in the abstract.

    Writes the offending patents and entities to ``non_include.yml``.
    """
    ner_result = read_ner_result()
    err_patent = {}
    for ner_word in ner_result:
        text = ner_word['摘要']
        patent = ner_word['专利号']
        non_include = []
        for entity_list in ner_word['结果'].values():
            for label in entity_list:
                if label not in text:
                    non_include.append(label)
        if non_include:
            err_patent[patent] = non_include
    Path('non_include.yml').write_text(
        yaml.dump(err_patent, allow_unicode=True), encoding='utf-8')


def building_dict():
    """Union of the Sogou building vocabulary and extracted NER entities.

    Returns a set of words from the Sogou cell dictionary plus every
    entity word in the kept categories (``categs``).
    """
    records = get_words_from_sogou_cell_dict(
        Path(building_dict_file_name_sogo).expanduser())
    build_words = [r[1] for r in records]
    ner_result = read_ner_result()
    words = []
    for data in ner_result:
        for entity_name, entity_words in data['结果'].items():
            if entity_name not in categs:
                continue
            words.extend(entity_words)
    return set(build_words) | set(words)


if __name__ == '__main__':
    # Pipeline steps (run one at a time as needed):
    #   ner_demo_by_deepseek() -> resave_ds_patents_result()
    #   -> add_bio() -> check_entity_name() -> building_dict()
    building_dict()