| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197 |
import ast
import re
from collections import defaultdict
from datetime import datetime
from pathlib import Path

import jieba
import jieba.posseg as pseg
import pandas as pd
import polars as pl
import yaml

from data_preparation import sample_file_name, columns
from deepseek import ds_ner
from env import env
from scel2text import get_words_from_sogou_cell_dict
# File names of the intermediate artifacts produced / consumed below.
ds_sample_ner_file_name = 'ds_sample_ner_result.yml'  # DeepSeek NER results (YAML)
building_dict_file_name_sogo = '建筑词汇大全【官方推荐】.scel'  # Sogou cell dict of construction vocabulary
bio_file_name = '上市公司-专利摘要数据-筛选-样本-BIO.csv'  # BIO-annotated sample CSV

# Mapping from the Chinese entity-category names (as returned by the NER model)
# to short BIO tag suffixes. Both "材料类" and "材料" map to MAT because the
# model is inconsistent about that category's name.
entity_types = {
    "结构部件类": 'COM',     # structural components
    "材料类": 'MAT',         # materials
    "材料": 'MAT',           # materials (alternate name emitted by the model)
    "技术参数类": 'PAR',     # technical parameters
    "技术特征类": 'FEA',     # technical features
    "制造工艺类": 'MAC',     # manufacturing processes
    "功能属性类": 'FUN',     # functional attributes
    "规范标准类": 'REG',     # standards / regulations
    "专利法律实体类": 'PAT',  # patent legal entities
    "性能指标类": 'PER',     # performance indicators
    "特殊构造类": 'SPE',     # special constructions
}

# Only these categories are used when annotating BIO tags / building the dict.
categs = ['结构部件类', '材料类', '技术特征类', '制造工艺类', '特殊构造类']
class FlowList(list):
    """List subclass marking sequences to be dumped in YAML flow style ([a, b, c])."""
    ...
# Custom representer: emit FlowList instances as inline (flow-style) YAML sequences
def flow_list_representer(dumper, data):
    """Represent *data* as a YAML sequence using flow style."""
    return dumper.represent_sequence('tag:yaml.org,2002:seq', data, flow_style=True)
# Register the custom representer so yaml.dump uses it for FlowList objects
yaml.add_representer(FlowList, flow_list_representer)
def process(row):
    """Segment a patent abstract and split tokens by the known BIO vocabulary.

    Args:
        row: a record whose index 3 holds the abstract text (摘要).

    Returns:
        (origin_words, none_words): all segmented tokens in order, and the
        subset of tokens not present in any category of bio.yml.
    """
    # NOTE(review): the user dict and bio.yml are (re)loaded on every call;
    # hoist them out if process() runs in a loop — kept here to preserve the
    # original call-level behavior.
    jieba.load_userdict("word_dict.txt")
    # `with` closes the handle (the original leaked it via a bare open()).
    with open('bio.yml', encoding='utf-8') as f:
        bio = yaml.safe_load(f)
    description = row[3]
    words = pseg.cut(description)
    # Flatten all category word lists into one set for O(1) membership tests
    # (the original used a list, making every lookup O(n)).
    bio_words = set()
    for value in bio.values():
        if value:
            bio_words.update(value)
    origin_words = []
    none_words = []
    for word, _flag in words:  # POS flag from pseg.cut is unused
        origin_words.append(word)
        if word not in bio_words:
            none_words.append(word)
    return origin_words, none_words
def ner_sample_by_deepseek():
    """Run DeepSeek NER over every sampled patent and checkpoint results to YAML.

    Reads the sample CSV, calls ds_ner(patent_no, abstract) for each row,
    strips the markdown code fence from the response, parses the YAML, and
    rewrites both output files after every successful row so progress
    survives a crash. Failing rows are skipped (best-effort) but reported.
    """
    patents = []   # patent numbers processed so far
    results = []   # parsed NER results, same order as `patents`
    df = pl.read_csv(str(env.resolve_output(sample_file_name)), columns=columns, encoding="utf-8")
    df = df.sort('专利申请号')
    count = 1
    for row in df.iter_rows():
        try:
            result = ds_ner(row[1], row[3])
            # Strip the ```yaml fence DeepSeek wraps around its answer
            result = result.replace('```yaml\n', '').replace('```', '')
            result = yaml.safe_load(result)
            # Wrap each entity list so it is dumped in YAML flow style
            for t in result['结果']:
                result['结果'][t] = FlowList(result['结果'][t])
            results.append(result)
            patents.append(row[1])
            # Checkpoint: rewrite both output files in full after every row
            env.resolve_output(ds_sample_ner_file_name).write_text(yaml.dump(results, allow_unicode=True), encoding='utf-8')
            env.resolve_output('ds_ner_patents.yml').write_text(yaml.dump(patents), encoding='utf-8')
            print(datetime.now(), row[1], count)
            count += 1
        except Exception as e:
            # Best-effort: skip rows whose API call or YAML parse fails, but
            # report them instead of swallowing the error silently (original
            # `continue`d without any trace of what went wrong).
            print(datetime.now(), row[1], 'skipped:', e)
            continue
    print('All ok')
def read_ner_result():
    """Load and return the DeepSeek sample NER results from the output YAML file."""
    path = env.resolve_output(ds_sample_ner_file_name)
    return yaml.safe_load(path.read_text(encoding='utf-8'))
def annotate_bio(text, ner_words, categories=None, type_map=None):
    """Character-level BIO annotation of *text* from NER entity lists.

    Only categories listed in *categories* are annotated; the tag suffix comes
    from *type_map* (e.g. 结构部件类 -> COM, giving B-COM / I-COM). Every
    occurrence of each entity is tagged, and later matches overwrite earlier
    labels.

    Args:
        text: the abstract to annotate, treated as a sequence of characters.
        ner_words: mapping of category name -> list of entity strings.
        categories: category names to keep; defaults to module-level `categs`.
        type_map: category name -> tag suffix; defaults to `entity_types`.

    Returns:
        (pairs, not_found): `pairs` is a list of (char, label) tuples with
        labels in {O, B-XXX, I-XXX}; `not_found` lists entity strings that
        never occurred in *text*.
    """
    categories = categs if categories is None else categories
    type_map = entity_types if type_map is None else type_map
    tokens = list(text)  # split into individual characters
    labels = ["O"] * len(tokens)
    not_found = []
    for entity_name, entity_words in ner_words.items():
        if entity_name not in categories:
            continue  # ignore categories outside the selected set
        entity_type = type_map[entity_name]
        for entity in entity_words:
            if not entity:
                continue  # an empty pattern would match at every offset
            # Escape only for searching; report the ORIGINAL text on a miss
            # (the original appended the re.escape()d form to not_found).
            matches = list(re.finditer(re.escape(entity), text))
            if not matches:
                not_found.append(entity)
            for match in matches:
                start, end = match.start(), match.end()
                labels[start] = f"B-{entity_type}"
                for i in range(start + 1, end):
                    labels[i] = f"I-{entity_type}"
    return list(zip(tokens, labels)), not_found
def add_bio():
    """Attach character-level BIO annotations to every NER result and save as CSV.

    For each record the abstract (摘要) is annotated from its entity lists;
    the (char, label) pairs are stored stringified in a 'bio' column, the raw
    entity dict is renamed from '结果' to 'ner', and entities that never
    occurred in the abstract are kept under 'not_found'.
    """
    records = read_ner_result()
    for record in records:
        pairs, missing = annotate_bio(record['摘要'], record['结果'])
        record['bio'] = str(pairs)
        record['ner'] = record.pop('结果')
        if missing:
            record['not_found'] = FlowList(missing)
            print(record['专利号'], missing)
    out_path = str(env.resolve_output(bio_file_name))
    pd.DataFrame.from_dict(records).to_csv(out_path, index=False, encoding="utf-8")
    print('All done.')
def get_bio():
    """Load the BIO CSV and return (patent_numbers, abstracts, label_sequences).

    The 'bio' column holds a stringified list of (char, label) tuples as
    written by add_bio(); it is parsed back with ast.literal_eval — safe,
    unlike the original eval() on file contents — keeping only the label of
    each pair.
    """
    df = pd.read_csv(str(env.resolve_output(bio_file_name)), encoding='utf-8')
    labels = []
    for cell in df['bio']:
        pairs = ast.literal_eval(cell)  # safer than eval() on untrusted file data
        labels.append([label for _, label in pairs])
    return df['专利号'].tolist(), df['摘要'].tolist(), labels
def check_entity_name():
    """Find NER entities that do not literally appear in their patent abstract.

    Writes a mapping of patent number -> list of missing entity strings to
    non_include.yml.
    """
    err_patent = {}
    for record in read_ner_result():
        abstract = record['摘要']
        missing = [
            entity
            for entity_list in record['结果'].values()
            for entity in entity_list
            if entity not in abstract
        ]
        if missing:
            err_patent[record['专利号']] = missing
    # NOTE(review): written relative to the CWD, unlike the other outputs that
    # go through env.resolve_output — confirm this is intentional.
    Path('non_include.yml').write_text(yaml.dump(err_patent, allow_unicode=True), encoding='utf-8')
def building_dict():
    """Build a construction-domain vocabulary set.

    Combines the Sogou construction cell dictionary with every entity string
    (from the selected categories only) found in the DeepSeek NER results.

    Returns:
        The union of both vocabularies as a set of words.
    """
    # Sogou .scel dictionary: take each record's second field (the word)
    records = get_words_from_sogou_cell_dict(env.resolve_data(building_dict_file_name_sogo))
    vocab = {record[1] for record in records}
    for data in read_ner_result():
        for name, entity_words in data['结果'].items():
            if name in categs:
                vocab.update(entity_words)
    return vocab
if __name__ == '__main__':
    # Entry-point scratchpad: uncomment the pipeline stage to run.
    # building_dict()
    # ner_sample_by_deepseek()
    # resave_ds_patents_result()
    # add_bio()
    # check_entity_name()
    building_dict()
|