# bio.py — BIO-annotation pipeline for patent-abstract NER results.
  1. import re
  2. import pandas as pd
  3. import yaml
  4. import jieba
  5. import jieba.posseg as pseg
  6. import polars as pl
  7. from collections import defaultdict
  8. from datetime import datetime
  9. from pathlib import Path
  10. from scel2text import get_words_from_sogou_cell_dict
  11. from data_preparation import sample_file_name, columns
  12. from deepseek import ds_ner
  13. from env import env
# Artifact filenames; resolved against the project output/data dirs via `env`.
ds_sample_ner_file_name = 'ds_sample_ner_result.yml'
building_dict_file_name_sogo = '建筑词汇大全【官方推荐】.scel'  # Sogou .scel cell dictionary
bio_file_name = '上市公司-专利摘要数据-筛选-样本-BIO.csv'
# Chinese NER category name -> short BIO tag suffix (labels become B-XXX / I-XXX).
entity_types = {
    "结构部件类": 'COM',
    "材料类": 'MAT',
    "材料": 'MAT',  # NOTE(review): alias without the 类 suffix — presumably model output variance; confirm
    "技术参数类": 'PAR',
    "技术特征类": 'FEA',
    "制造工艺类": 'MAC',
    "功能属性类": 'FUN',
    "规范标准类": 'REG',
    "专利法律实体类": 'PAT',
    "性能指标类": 'PER',
    "特殊构造类": 'SPE',
}
# Only these categories are kept when annotating BIO labels / building vocabularies.
categs = ['结构部件类', '材料类', '技术特征类', '制造工艺类', '特殊构造类']
class FlowList(list):
    # Marker subclass of list: sequences wrapped in FlowList are serialized
    # by PyYAML in flow style ([a, b, c]) via the custom representer
    # registered with yaml.add_representer below.
    ...
  33. # 自定义处理 hobbies 字段为流格式
  34. def flow_list_representer(dumper, data):
  35. return dumper.represent_sequence('tag:yaml.org,2002:seq', data, flow_style=True)
  36. # 注册自定义 representer
  37. yaml.add_representer(FlowList, flow_list_representer)
  38. def process(row):
  39. jieba.load_userdict("word_dict.txt")
  40. bio = yaml.safe_load(open('bio.yml'))
  41. description = row[3]
  42. words = pseg.cut(description)
  43. origin_words = []
  44. none_words = []
  45. bio_words = []
  46. for value in bio.values():
  47. if value:
  48. bio_words += value
  49. for word, flag in words:
  50. origin_words.append(word)
  51. if word not in bio_words:
  52. none_words.append(word)
  53. return origin_words, none_words
  54. def ner_sample_by_deepseek():
  55. patents = []
  56. df = pl.read_csv(str(env.resolve_output(sample_file_name)), columns=columns, encoding="utf-8")
  57. df = df.sort('专利申请号')
  58. results = []
  59. count = 1
  60. for row in df.iter_rows():
  61. try:
  62. result = ds_ner(row[1], row[3])
  63. # 去除deepseek返回的yaml格式
  64. result = result.replace('```yaml\n', '').replace('```', '')
  65. result = yaml.safe_load(result)
  66. for t in result['结果']:
  67. result['结果'][t] = FlowList(result['结果'][t])
  68. results.append(result)
  69. patents.append(row[1])
  70. env.resolve_output(ds_sample_ner_file_name).write_text(yaml.dump(results, allow_unicode=True), encoding='utf-8')
  71. env.resolve_output('ds_ner_patents.yml').write_text(yaml.dump(patents), encoding='utf-8')
  72. print(datetime.now(), row[1], count)
  73. count += 1
  74. except Exception as e:
  75. continue
  76. print('All ok')
  77. def read_ner_result():
  78. yaml_str = env.resolve_output(ds_sample_ner_file_name).read_text(encoding='utf-8')
  79. ner_result = yaml.safe_load(yaml_str)
  80. return ner_result
  81. def annotate_bio(text, ner_words):
  82. '''
  83. 1. 结构部件类
  84. 2. 材料类
  85. 3. 技术参数类
  86. 4. 技术特征类
  87. 5. 制造工艺类
  88. 6. 功能属性类
  89. 7. 规范标准类
  90. 8. 专利法律实体类
  91. 9. 性能指标类
  92. 10. 特殊构造类
  93. '''
  94. tokens = list(text) # 将文本按字拆分
  95. labels = ["O"] * len(tokens)
  96. not_found = []
  97. # 标注
  98. for entity_name, entity_word in ner_words.items():
  99. if entity_name not in categs:
  100. continue
  101. entity_type = entity_types[entity_name]
  102. for entity in entity_word:
  103. entity = re.escape(entity)
  104. matchs = list(re.finditer(entity, text))
  105. if not matchs:
  106. not_found.append(entity)
  107. for match in matchs:
  108. start, end = match.start(), match.end()
  109. labels[start] = f"B-{entity_type}"
  110. for i in range(start + 1, end):
  111. labels[i] = f"I-{entity_type}"
  112. return list(zip(tokens, labels)), not_found
  113. def add_bio():
  114. ner_results = read_ner_result()
  115. for ner_word in ner_results:
  116. bio_result, not_found = annotate_bio(ner_word['摘要'], ner_word['结果'])
  117. ner_word['bio'] = str(bio_result)
  118. ner_word['ner'] = ner_word.pop('结果')
  119. if not_found:
  120. ner_word['not_found'] = FlowList(not_found)
  121. print(ner_word['专利号'], not_found)
  122. bio_file = str(env.resolve_output(bio_file_name))
  123. df = pd.DataFrame.from_dict(ner_results) # 转换为 DataFrame
  124. df.to_csv(bio_file, index=False, encoding="utf-8")
  125. print('All done.')
  126. def get_bio():
  127. df = pd.read_csv(str(env.resolve_output(bio_file_name)), encoding='utf-8')
  128. labels = []
  129. for label in df['bio']:
  130. label = eval(label)
  131. labels.append([l[1] for l in label])
  132. return df['专利号'].tolist(), df['摘要'].tolist(), labels
  133. def check_entity_name():
  134. ner_result = read_ner_result()
  135. err_patent = {}
  136. for ner_word in ner_result:
  137. text = ner_word['摘要']
  138. patent = ner_word['专利号']
  139. non_include = []
  140. for lables in ner_word['结果'].values():
  141. for label in lables:
  142. if label not in text:
  143. non_include.append(label)
  144. if non_include:
  145. err_patent[patent] = non_include
  146. Path('non_include.yml').write_text(yaml.dump(err_patent, allow_unicode=True), encoding='utf-8')
  147. def building_dict():
  148. # 搜狗建筑词汇库
  149. records = get_words_from_sogou_cell_dict(env.resolve_data(building_dict_file_name_sogo))
  150. build_words = [r[1] for r in records]
  151. ner_result = read_ner_result()
  152. words = []
  153. for data in ner_result:
  154. for entity_name, entity_words in data['结果'].items():
  155. if entity_name not in categs:
  156. continue
  157. words.extend(entity_words)
  158. return set(build_words) | set(words)
if __name__ == '__main__':
    # Ad-hoc driver: uncomment the pipeline stage you want to run.
    # building_dict()
    # ner_sample_by_deepseek()
    # resave_ds_patents_result()
    # add_bio()
    # check_entity_name()
    building_dict()