bio.py 4.5 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152
  1. import re
  2. from collections import defaultdict
  3. from datetime import datetime
  4. import yaml
  5. import jieba
  6. import jieba.posseg as pseg
  7. import polars as pl
  8. from pathlib import Path
  9. from data_preparation import demo_file_name, columns
  10. from deepseek import ds_ner
  11. import pandas as pd
# Path of the CSV that add_bio() writes and get_bio() reads back.
# The leading '~' is stored unexpanded here.
bio_file_name = '~/Documents/论文/上市公司-建筑工业化-专利摘要数据-样本-BIO.csv'
  13. class FlowList(list):
  14. ...
  15. # 自定义处理 hobbies 字段为流格式
  16. def custom_hobbies_representer(dumper, data):
  17. return dumper.represent_sequence('tag:yaml.org,2002:seq', data, flow_style=True)
# Register the custom representer for FlowList values.
yaml.add_representer(FlowList, custom_hobbies_representer)
# "CN200910091292.3"
# NOTE(review): the quoted string above looks like a sample patent
# application number kept for reference - confirm or remove.
# NOTE(review): all_words is not referenced in the visible part of this file.
all_words = defaultdict(list)
  22. def process(row):
  23. jieba.load_userdict("word_dict.txt")
  24. bio = yaml.safe_load(open('bio.yml'))
  25. description = row[3]
  26. words = pseg.cut(description)
  27. origin_words = []
  28. none_words = []
  29. bio_words = []
  30. for value in bio.values():
  31. if value:
  32. bio_words += value
  33. for word, flag in words:
  34. origin_words.append(word)
  35. if word not in bio_words:
  36. none_words.append(word)
  37. return origin_words, none_words
  38. def ner_demo_by_deepseek():
  39. patents = []
  40. df = pl.read_csv(str(Path(demo_file_name).expanduser()), columns=columns, encoding="utf-8")
  41. df = df.sort('专利申请号')
  42. results = []
  43. count = 1
  44. for row in df.iter_rows():
  45. try:
  46. result = ds_ner(row[1], row[3])
  47. # 去除deepseek返回的yaml格式
  48. result = result.replace('```yaml\n', '').replace('```', '')
  49. result = yaml.safe_load(result)
  50. results.append(result)
  51. patents.append(row[1])
  52. Path('ds_patents_result.yml').write_text(yaml.safe_dump(results, allow_unicode=True), encoding='utf-8')
  53. Path('ds_patents.yml').write_text(yaml.safe_dump(patents), encoding='utf-8')
  54. print(datetime.now(), row[1], count)
  55. count += 1
  56. except Exception as e:
  57. continue
  58. print('All ok')
  59. def read_ner_result():
  60. yaml_str = Path('ds_patents_result.yml').read_text(encoding='utf-8')
  61. ner_result = yaml.safe_load(yaml_str)
  62. return ner_result
  63. def resave_ds_patents_result():
  64. ner_result = read_ner_result()
  65. for ns in ner_result:
  66. for t in ns['结果']:
  67. ns['结果'][t] = FlowList(ns['结果'][t])
  68. result_str = yaml.dump(ner_result, allow_unicode=True)
  69. Path('ds_patents_result_resave.yml').write_text(result_str, encoding='utf-8')
  70. def annotate_bio(text, ner_words):
  71. tokens = list(text) # 将文本按字拆分
  72. labels = ["O"] * len(tokens)
  73. not_found = []
  74. # 标注材料(MAT)
  75. for mat in ner_words:
  76. mat = re.escape(mat)
  77. matchs = list(re.finditer(mat, text))
  78. if not matchs:
  79. not_found.append(mat)
  80. for match in matchs:
  81. start, end = match.start(), match.end()
  82. labels[start] = "B-MAT"
  83. for i in range(start + 1, end):
  84. labels[i] = "I-MAT"
  85. return list(zip(tokens, labels)), not_found
  86. def add_bio():
  87. ner_results = read_ner_result()
  88. for ner_word in ner_results:
  89. all_word = []
  90. for ner_result in ner_word['结果'].values():
  91. all_word.extend(ner_result)
  92. all_word = set(all_word)
  93. bio_result, not_found = annotate_bio(ner_word['摘要'], all_word)
  94. ner_word['bio'] = str(bio_result)
  95. ner_word['ner'] = ner_word.pop('结果')
  96. if not_found:
  97. ner_word['not_found'] = FlowList(not_found)
  98. print(ner_word['专利号'], not_found)
  99. bio_file = str(Path(bio_file_name).expanduser())
  100. # for ns in ner_results:
  101. # for t in ns['ner']:
  102. # ns['ner'][t] = FlowList(ns['ner'][t])
  103. # result_str = yaml.dump(ner_results, allow_unicode=True)
  104. # Path(bio_file_name).expanduser().write_text(result_str, encoding='utf-8')
  105. df = pd.DataFrame.from_dict(ner_results) # 转换为 DataFrame
  106. df.to_csv(bio_file, index=False, encoding="utf-8")
  107. print('All done.')
  108. def get_bio():
  109. df = pd.read_csv(bio_file_name, encoding='utf-8')
  110. labels = []
  111. for label in df['bio']:
  112. label = eval(label)
  113. labels.append([l[1] for l in label])
  114. return df['摘要'].tolist(), labels
# Script entry point: only the BIO annotation step (add_bio) is executed;
# the other pipeline steps are left commented out below.
if __name__ == '__main__':
    # patents = yaml.safe_load(open('ds_patents.yml')) or []
    # NOTE(review): none_words is assigned but not used in the visible code.
    none_words = ['马克', '数据网', '微信', '公众号', '马 克 数 据 网', '百度搜索']
    # ner_demo_by_deepseek()
    # resave_ds_patents_result()
    add_bio()