import ast
import re
from collections import defaultdict
from datetime import datetime
from pathlib import Path

import yaml
import jieba
import jieba.posseg as pseg
import polars as pl
import pandas as pd

from data_preparation import demo_file_name, columns
from deepseek import ds_ner

# Output CSV holding the BIO-tagged sample of patent abstracts.
bio_file_name = '~/Documents/论文/上市公司-建筑工业化-专利摘要数据-样本-BIO.csv'


class FlowList(list):
    """Marker list type: dumped by PyYAML in inline (flow) style, e.g. [a, b]."""
    ...


def custom_hobbies_representer(dumper, data):
    """Represent a FlowList as a YAML sequence in flow style."""
    return dumper.represent_sequence('tag:yaml.org,2002:seq', data, flow_style=True)


# Register the flow-style representer for FlowList (affects yaml.dump, not safe_dump).
yaml.add_representer(FlowList, custom_hobbies_representer)

# NOTE(review): module-level accumulator; not referenced anywhere in this file.
all_words = defaultdict(list)


def process(row):
    """Segment a patent abstract with jieba and split words by BIO-dictionary membership.

    Args:
        row: a sequence whose index 3 holds the abstract text.

    Returns:
        (origin_words, none_words): all segmented words, and the subset not
        present in any value list of ``bio.yml``.
    """
    jieba.load_userdict("word_dict.txt")
    # Read the BIO dictionary with an explicit context manager so the
    # file handle is closed deterministically (the original leaked it).
    with open('bio.yml') as f:
        bio = yaml.safe_load(f)
    description = row[3]
    words = pseg.cut(description)

    # Flatten every non-empty value list of the BIO dict into one lookup list.
    bio_words = []
    for value in bio.values():
        if value:
            bio_words += value

    origin_words = []
    none_words = []
    for word, flag in words:
        origin_words.append(word)
        if word not in bio_words:
            none_words.append(word)
    return origin_words, none_words


def ner_demo_by_deepseek():
    """Run DeepSeek NER over every patent row and checkpoint results to YAML.

    Results are re-written to disk after each successful row so a crash or
    interrupt loses at most the current row.
    """
    patents = []
    df = pl.read_csv(str(Path(demo_file_name).expanduser()), columns=columns, encoding="utf-8")
    df = df.sort('专利申请号')
    results = []
    count = 1
    for row in df.iter_rows():
        try:
            result = ds_ner(row[1], row[3])
            # Strip the Markdown code fence DeepSeek wraps around its YAML answer.
            result = result.replace('```yaml\n', '').replace('```', '')
            result = yaml.safe_load(result)
            results.append(result)
            patents.append(row[1])
            # Checkpoint after every row (cheap insurance against long-run failures).
            Path('ds_patents_result.yml').write_text(yaml.safe_dump(results, allow_unicode=True), encoding='utf-8')
            Path('ds_patents.yml').write_text(yaml.safe_dump(patents), encoding='utf-8')
            print(datetime.now(), row[1], count)
            count += 1
        except Exception as e:
            # Best-effort: skip the failing row, but report it instead of
            # swallowing the error silently as before.
            print(datetime.now(), row[1], 'failed:', e)
            continue
    print('All ok')


def read_ner_result():
    """Load and return the checkpointed DeepSeek NER results from YAML."""
    yaml_str = Path('ds_patents_result.yml').read_text(encoding='utf-8')
    ner_result = yaml.safe_load(yaml_str)
    return ner_result


def resave_ds_patents_result():
    """Re-dump the NER results with entity lists rendered in YAML flow style."""
    ner_result = read_ner_result()
    for ns in ner_result:
        # '结果' ("result") maps entity categories to lists of entity strings;
        # wrap each list so the registered representer emits it inline.
        for t in ns['结果']:
            ns['结果'][t] = FlowList(ns['结果'][t])
    result_str = yaml.dump(ner_result, allow_unicode=True)
    Path('ds_patents_result_resave.yml').write_text(result_str, encoding='utf-8')


def annotate_bio(text, ner_words):
    """Character-level BIO tagging of ``text`` for the given entity words.

    Every occurrence of each word in ``ner_words`` is tagged B-MAT at its
    first character and I-MAT for the rest; all other characters stay O.

    Args:
        text: the abstract to annotate.
        ner_words: iterable of entity strings to locate in ``text``.

    Returns:
        (pairs, not_found): ``pairs`` is a list of (char, label) tuples;
        ``not_found`` lists entity words with no match in ``text``.
    """
    tokens = list(text)  # split the text into single characters
    labels = ["O"] * len(tokens)
    not_found = []
    for word in ner_words:
        # Escape only for matching; report the ORIGINAL word on a miss
        # (the original code appended the escaped pattern, which corrupts
        # words containing regex metacharacters).
        matches = list(re.finditer(re.escape(word), text))
        if not matches:
            not_found.append(word)
        for match in matches:
            start, end = match.start(), match.end()
            labels[start] = "B-MAT"
            for i in range(start + 1, end):
                labels[i] = "I-MAT"
    return list(zip(tokens, labels)), not_found


def add_bio():
    """Attach BIO annotations to every NER record and export them as CSV.

    For each record: pools all entity words across categories, tags the
    abstract ('摘要'), stores the tagging under 'bio', renames '结果' to
    'ner', and records any unmatched words under 'not_found'.
    """
    ner_results = read_ner_result()
    for ner_word in ner_results:
        # Pool entity words from every category, de-duplicated.
        all_word = []
        for ner_result in ner_word['结果'].values():
            all_word.extend(ner_result)
        all_word = set(all_word)

        bio_result, not_found = annotate_bio(ner_word['摘要'], all_word)
        ner_word['bio'] = str(bio_result)
        ner_word['ner'] = ner_word.pop('结果')
        if not_found:
            ner_word['not_found'] = FlowList(not_found)
            print(ner_word['专利号'], not_found)

    bio_file = str(Path(bio_file_name).expanduser())
    df = pd.DataFrame.from_dict(ner_results)  # records -> DataFrame
    df.to_csv(bio_file, index=False, encoding="utf-8")
    print('All done.')


def get_bio():
    """Load the BIO CSV back into (abstracts, label-sequences).

    Returns:
        (texts, labels): list of abstract strings and, for each, the list
        of per-character BIO labels parsed from the 'bio' column.
    """
    df = pd.read_csv(bio_file_name, encoding='utf-8')
    labels = []
    for label in df['bio']:
        # The 'bio' column holds str(list-of-tuples); parse it with
        # ast.literal_eval instead of eval — same result for literal data,
        # but it cannot execute arbitrary code from the CSV.
        label = ast.literal_eval(label)
        labels.append([l[1] for l in label])
    return df['摘要'].tolist(), labels


if __name__ == '__main__':
    # patents = yaml.safe_load(open('ds_patents.yml')) or []
    # Boilerplate/watermark tokens sometimes present in scraped abstracts.
    none_words = ['马克', '数据网', '微信', '公众号', '马 克 数 据 网', '百度搜索']
    # ner_demo_by_deepseek()
    # resave_ds_patents_result()
    add_bio()