import ast
import re
from collections import defaultdict
from datetime import datetime
from pathlib import Path

import jieba
import jieba.posseg as pseg
import pandas as pd
import polars as pl
import yaml

from data_preparation import demo_file_name, columns
from deepseek import ds_ner
- bio_file_name = '~/Documents/论文/上市公司-建筑工业化-专利摘要数据-样本-BIO.csv'
- class FlowList(list):
- ...
- # 自定义处理 hobbies 字段为流格式
- def custom_hobbies_representer(dumper, data):
- return dumper.represent_sequence('tag:yaml.org,2002:seq', data, flow_style=True)
- # 注册自定义 representer
- yaml.add_representer(FlowList, custom_hobbies_representer)
- # "CN200910091292.3"
- all_words = defaultdict(list)
- def process(row):
- jieba.load_userdict("word_dict.txt")
- bio = yaml.safe_load(open('bio.yml'))
- description = row[3]
- words = pseg.cut(description)
- origin_words = []
- none_words = []
- bio_words = []
- for value in bio.values():
- if value:
- bio_words += value
- for word, flag in words:
- origin_words.append(word)
- if word not in bio_words:
- none_words.append(word)
- return origin_words, none_words
- def ner_demo_by_deepseek():
- patents = []
- df = pl.read_csv(str(Path(demo_file_name).expanduser()), columns=columns, encoding="utf-8")
- df = df.sort('专利申请号')
- results = []
- count = 1
- for row in df.iter_rows():
- try:
- result = ds_ner(row[1], row[3])
- # 去除deepseek返回的yaml格式
- result = result.replace('```yaml\n', '').replace('```', '')
- result = yaml.safe_load(result)
- results.append(result)
- patents.append(row[1])
- Path('ds_patents_result.yml').write_text(yaml.safe_dump(results, allow_unicode=True), encoding='utf-8')
- Path('ds_patents.yml').write_text(yaml.safe_dump(patents), encoding='utf-8')
- print(datetime.now(), row[1], count)
- count += 1
- except Exception as e:
- continue
- print('All ok')
- def read_ner_result():
- yaml_str = Path('ds_patents_result.yml').read_text(encoding='utf-8')
- ner_result = yaml.safe_load(yaml_str)
- return ner_result
- def resave_ds_patents_result():
- ner_result = read_ner_result()
- for ns in ner_result:
- for t in ns['结果']:
- ns['结果'][t] = FlowList(ns['结果'][t])
- result_str = yaml.dump(ner_result, allow_unicode=True)
- Path('ds_patents_result_resave.yml').write_text(result_str, encoding='utf-8')
- def annotate_bio(text, ner_words):
- tokens = list(text) # 将文本按字拆分
- labels = ["O"] * len(tokens)
- not_found = []
- # 标注材料(MAT)
- for mat in ner_words:
- mat = re.escape(mat)
- matchs = list(re.finditer(mat, text))
- if not matchs:
- not_found.append(mat)
- for match in matchs:
- start, end = match.start(), match.end()
- labels[start] = "B-MAT"
- for i in range(start + 1, end):
- labels[i] = "I-MAT"
- return list(zip(tokens, labels)), not_found
- def add_bio():
- ner_results = read_ner_result()
- for ner_word in ner_results:
- all_word = []
- for ner_result in ner_word['结果'].values():
- all_word.extend(ner_result)
- all_word = set(all_word)
- bio_result, not_found = annotate_bio(ner_word['摘要'], all_word)
- ner_word['bio'] = str(bio_result)
- ner_word['ner'] = ner_word.pop('结果')
- if not_found:
- ner_word['not_found'] = FlowList(not_found)
- print(ner_word['专利号'], not_found)
- bio_file = str(Path(bio_file_name).expanduser())
- # for ns in ner_results:
- # for t in ns['ner']:
- # ns['ner'][t] = FlowList(ns['ner'][t])
- # result_str = yaml.dump(ner_results, allow_unicode=True)
- # Path(bio_file_name).expanduser().write_text(result_str, encoding='utf-8')
- df = pd.DataFrame.from_dict(ner_results) # 转换为 DataFrame
- df.to_csv(bio_file, index=False, encoding="utf-8")
- print('All done.')
- def get_bio():
- df = pd.read_csv(bio_file_name, encoding='utf-8')
- labels = []
- for label in df['bio']:
- label = eval(label)
- labels.append([l[1] for l in label])
- return df['摘要'].tolist(), labels
- if __name__ == '__main__':
- # patents = yaml.safe_load(open('ds_patents.yml')) or []
- none_words = ['马克', '数据网', '微信', '公众号', '马 克 数 据 网', '百度搜索']
- # ner_demo_by_deepseek()
- # resave_ds_patents_result()
- add_bio()