# bio.py — BIO annotation pipeline for construction-industry patent abstracts.
import ast
import re
from collections import defaultdict
from datetime import datetime
from pathlib import Path

import jieba
import jieba.posseg as pseg
import pandas as pd
import polars as pl
import yaml

import bio
from data_preparation import demo_file_name, columns, read_ner_result
from deepseek import ds_ner
from scel2text import get_words_from_sogou_cell_dict
# Sogou .scel input-method cell dictionary of construction vocabulary.
building_dict_file_name_sogo = '~/Documents/论文/建筑词汇大全【官方推荐】.scel'
# Output CSV holding BIO-annotated patent-abstract samples.
bio_file_name = '~/Documents/论文/上市公司-建筑工业化-专利摘要数据-样本-BIO.csv'
# Entity category name -> BIO tag suffix (B-XXX / I-XXX).
# NOTE(review): both "材料类" and "材料" map to MAT — presumably the NER model
# returns either spelling; confirm against the model output.
entity_types = {
    "结构部件类": 'COM',
    "材料类": 'MAT',
    "材料": 'MAT',
    "技术参数类": 'PAR',
    "技术特征类": 'FEA',
    "制造工艺类": 'MAC',
    "功能属性类": 'FUN',
    "规范标准类": 'REG',
    "专利法律实体类": 'PAT',
    "性能指标类": 'PER',
    "特殊构造类": 'SPE',
}
# Only these categories are kept when annotating BIO labels and building the dictionary.
categs = ['结构部件类', '材料类', '技术特征类', '制造工艺类', '特殊构造类']
  30. class FlowList(list):
  31. ...
# Custom PyYAML representer: emit the sequence in flow style, e.g. [a, b, c].
def custom_hobbies_representer(dumper, data):
    """Represent `data` as a standard YAML sequence using inline flow style."""
    return dumper.represent_sequence('tag:yaml.org,2002:seq', data, flow_style=True)
# Register the flow-style representer so FlowList instances dump inline.
yaml.add_representer(FlowList, custom_hobbies_representer)
# Example patent application number: "CN200910091292.3"
all_words = defaultdict(list)  # NOTE(review): appears unused in this chunk — verify before removing
  39. def process(row):
  40. jieba.load_userdict("word_dict.txt")
  41. bio = yaml.safe_load(open('bio.yml'))
  42. description = row[3]
  43. words = pseg.cut(description)
  44. origin_words = []
  45. none_words = []
  46. bio_words = []
  47. for value in bio.values():
  48. if value:
  49. bio_words += value
  50. for word, flag in words:
  51. origin_words.append(word)
  52. if word not in bio_words:
  53. none_words.append(word)
  54. return origin_words, none_words
  55. def ner_demo_by_deepseek():
  56. patents = []
  57. df = pl.read_csv(str(Path(demo_file_name).expanduser()), columns=columns, encoding="utf-8")
  58. df = df.sort('专利申请号')
  59. results = []
  60. count = 1
  61. for row in df.iter_rows():
  62. try:
  63. result = ds_ner(row[1], row[3])
  64. # 去除deepseek返回的yaml格式
  65. result = result.replace('```yaml\n', '').replace('```', '')
  66. result = yaml.safe_load(result)
  67. for t in result['结果']:
  68. result['结果'][t] = FlowList(result['结果'][t])
  69. results.append(result)
  70. patents.append(row[1])
  71. Path('ds_patents_result.yml').write_text(yaml.dump(results, allow_unicode=True), encoding='utf-8')
  72. Path('ds_patents.yml').write_text(yaml.dump(patents), encoding='utf-8')
  73. print(datetime.now(), row[1], count)
  74. count += 1
  75. except Exception as e:
  76. continue
  77. print('All ok')
  78. def resave_ds_patents_result():
  79. ner_result = read_ner_result()
  80. for ns in ner_result:
  81. for t in ns['结果']:
  82. ns['结果'][t] = FlowList(ns['结果'][t])
  83. result_str = yaml.dump(ner_result, allow_unicode=True)
  84. Path('ds_patents_result_resave.yml').write_text(result_str, encoding='utf-8')
  85. def annotate_bio(text, ner_words):
  86. '''
  87. 1. 结构部件类
  88. 2. 材料类
  89. 3. 技术参数类
  90. 4. 技术特征类
  91. 5. 制造工艺类
  92. 6. 功能属性类
  93. 7. 规范标准类
  94. 8. 专利法律实体类
  95. 9. 性能指标类
  96. 10. 特殊构造类
  97. '''
  98. tokens = list(text) # 将文本按字拆分
  99. labels = ["O"] * len(tokens)
  100. not_found = []
  101. # 标注
  102. for entity_name, entity_word in ner_words.items():
  103. if entity_name not in categs:
  104. continue
  105. entity_type = entity_types[entity_name]
  106. for entity in entity_word:
  107. entity = re.escape(entity)
  108. matchs = list(re.finditer(entity, text))
  109. if not matchs:
  110. not_found.append(entity)
  111. for match in matchs:
  112. start, end = match.start(), match.end()
  113. labels[start] = f"B-{entity_type}"
  114. for i in range(start + 1, end):
  115. labels[i] = f"I-{entity_type}"
  116. return list(zip(tokens, labels)), not_found
  117. def add_bio():
  118. ner_results = read_ner_result()
  119. for ner_word in ner_results:
  120. bio_result, not_found = annotate_bio(ner_word['摘要'], ner_word['结果'])
  121. ner_word['bio'] = str(bio_result)
  122. ner_word['ner'] = ner_word.pop('结果')
  123. if not_found:
  124. ner_word['not_found'] = FlowList(not_found)
  125. print(ner_word['专利号'], not_found)
  126. bio_file = str(Path(bio_file_name).expanduser())
  127. # for ns in ner_results:
  128. # for t in ns['ner']:
  129. # ns['ner'][t] = FlowList(ns['ner'][t])
  130. # result_str = yaml.dump(ner_results, allow_unicode=True)
  131. # Path(bio_file_name).expanduser().write_text(result_str, encoding='utf-8')
  132. df = pd.DataFrame.from_dict(ner_results) # 转换为 DataFrame
  133. df.to_csv(bio_file, index=False, encoding="utf-8")
  134. print('All done.')
  135. def get_bio():
  136. df = pd.read_csv(bio_file_name, encoding='utf-8')
  137. labels = []
  138. for label in df['bio']:
  139. label = eval(label)
  140. labels.append([l[1] for l in label])
  141. return df['摘要'].tolist(), labels
  142. def check_entity_name():
  143. ner_result = read_ner_result()
  144. err_patent = {}
  145. for ner_word in ner_result:
  146. text = ner_word['摘要']
  147. patent = ner_word['专利号']
  148. non_include = []
  149. for lables in ner_word['结果'].values():
  150. for label in lables:
  151. if label not in text:
  152. non_include.append(label)
  153. if non_include:
  154. err_patent[patent] = non_include
  155. Path('non_include.yml').write_text(yaml.dump(err_patent, allow_unicode=True), encoding='utf-8')
  156. def building_dict():
  157. # 搜狗建筑词汇库
  158. records = get_words_from_sogou_cell_dict(Path(building_dict_file_name_sogo).expanduser())
  159. build_words = [r[1] for r in records]
  160. ner_result = read_ner_result()
  161. words = []
  162. for data in ner_result:
  163. for entity_name, entity_words in data['结果'].items():
  164. if entity_name not in categs:
  165. continue
  166. words.extend(entity_words)
  167. return set(build_words) | set(words)
if __name__ == '__main__':
    # Pipeline steps — run one at a time by (un)commenting:
    # building_dict()
    # ner_demo_by_deepseek()
    # resave_ds_patents_result()
    # add_bio()
    # check_entity_name()
    building_dict()  # NOTE(review): return value is discarded here — confirm intent