@@ -1,16 +1,15 @@
-from pathlib import Path
-
-from transformers import AutoTokenizer, AutoModelForTokenClassification
 import torch
 
 import bio
-
 import polars as pl
-from data_preparation import demo_file_name, columns, desc_file_name
-# entity_types = ["Material", "Component", "Equipment", "Process", "Organization", "Standard"]
-
 
+from transformers import AutoTokenizer, AutoModelForTokenClassification
+from torch.utils.data import Dataset
+from torch.nn import CrossEntropyLoss  # needed for the per-sample loss computation
+from data_preparation import columns, sample_file_name, clean_raw_text
 from bio import entity_types
+from env import env
 
 label_map = {"O": 0}
 index = 1
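+# Sketch (assumption): the elided lines below presumably fill label_map from
+# entity_types with BIO tags, along the lines of:
+#   for t in entity_types:
+#       label_map[f"B-{t}"] = index; index += 1
+#       label_map[f"I-{t}"] = index; index += 1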
@@ -27,11 +26,9 @@ label_index = {v: k for k, v in label_map.items()}
 # model = AutoModelForTokenClassification.from_pretrained(model_name, num_labels=len(set(label_map)) + 1)  # includes "O"
 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 
-from torch.utils.data import Dataset
-
-
 class PatentNERDataset(Dataset):
-    def __init__(self, texts, labels, tokenizer, max_length=512):
+    def __init__(self, index, texts, labels, tokenizer, max_length=512):
+        self.index = index
         self.texts = texts
         self.labels = labels
         self.tokenizer = tokenizer
@@ -51,7 +48,32 @@ class PatentNERDataset(Dataset):
 
         # Convert labels to ids
-        label_ids = [label_map[label] for label in labels] + [0] * (self.max_length - len(labels))
-        return {"input_ids": input_ids, "attention_mask": attention_mask, "labels": torch.tensor(label_ids)}
+        # pad with -100 so CrossEntropyLoss(ignore_index=-100) skips the padding
+        # positions instead of training them as "O"
+        label_ids = [label_map[label] for label in labels] + [-100] * (self.max_length - len(labels))
+        return {"input_ids": input_ids,
+                "attention_mask": attention_mask,
+                "labels": torch.tensor(label_ids),
+                "text": text,
+                "index": self.index[idx]
+                }
+
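+# Note: the default DataLoader collate keeps the string field "text" as a list
+# of str per batch, while numeric fields such as "index" and the label tensors
+# are stacked into tensors; batch["text"][i] and batch["index"][i] in train()
+# rely on this behavior.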
+def parse_entity(text, predict):
+    entities = []
+    bios = []
+    entity = []
+
+    current_categ = None
+    for word, tensor in zip(text, predict):
+        label_id = tensor.item()
+        if label_id == -100:  # skip positions ignored by the loss (padding)
+            continue
+        label = label_index[label_id]
+        # close the running entity on "O", on a fresh "B-" tag, or when the
+        # category changes
+        if label == 'O' or label.startswith('B-') or (current_categ and label[2:] != current_categ):
+            if entity:
+                entities.append(''.join(entity))
+                bios.append(current_categ)
+                entity = []
+                current_categ = None
+            if label == 'O':
+                continue
+        current_categ = label[2:]
+        entity.append(word)
+    if entity:  # flush a trailing entity at the end of the text
+        entities.append(''.join(entity))
+        bios.append(current_categ)
+    return entities, bios
+
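+# Illustrative example (assuming label_map maps "B-Material" -> 1 and
+# "I-Material" -> 2):
+#   parse_entity("水泥砂浆层", torch.tensor([1, 2, 0, 1, 2]))
+#   returns (['水泥', '浆层'], ['Material', 'Material'])
+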
 def train(model_name):
@@ -63,10 +85,10 @@ def train(model_name):
     # num_labels=len(set(label_map)) + 1)  # includes "O"
 
     # Training data
-    train_texts, train_labels = bio.get_bio()
+    train_index, train_texts, train_labels = bio.get_bio()
 
     # Create the dataset
-    dataset = PatentNERDataset(train_texts, train_labels, tokenizer)
+    dataset = PatentNERDataset(train_index, train_texts, train_labels, tokenizer)
 
     from torch.utils.data import DataLoader
@@ -77,6 +99,11 @@ def train(model_name):
     optimizer = AdamW(model.parameters(), lr=5e-5)
     train_loader = DataLoader(dataset, batch_size=2, shuffle=True)
 
+    # Loss function instance for computing per-sample losses:
+    # reduction='none' would return one unaggregated loss per element;
+    # reduction='mean' is used here to average over all tokens of a single sample
+    loss_fct = CrossEntropyLoss(ignore_index=-100, reduction='mean')
+
     # Training loop
     for epoch in range(4):
         for batch in train_loader:
@@ -88,16 +115,59 @@ def train(model_name):
             optimizer.step()
             print(f"Epoch {epoch}, Loss: {loss.item()}")
 
+            original_texts = batch["text"]
+            original_bios = batch["labels"]  # label-id tensor (only used in the commented-out print below)
+
+            # Get the model's logits (prediction scores)
+            logits = outputs.logits  # shape: (batch_size, sequence_length, num_labels)
+
+            # --- Compute each sample's loss individually ---
+            sample_losses = []
+            for i in range(logits.size(0)):  # iterate over the samples in the batch
+                # Extract the logits and labels of a single sample
+                sample_logits = logits[i]  # shape: (sequence_length, num_labels)
+                sample_labels = labels[i]  # shape: (sequence_length,)
+
+                # Make sure valid labels exist, to avoid division by zero or NaN
+                if (sample_labels != -100).sum() > 0:
+                    # Average loss of this sample via loss_fct;
+                    # CrossEntropyLoss expects (N, C) and (N,) inputs, here with
+                    # N=sequence_length and C=num_labels
+                    individual_loss = loss_fct(sample_logits, sample_labels)
+                    sample_losses.append((individual_loss.item(), i))
+                else:
+                    # No valid labels (e.g. all padding/special tokens): loss is 0
+                    sample_losses.append((0.0, i))
+
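+            # Vectorized alternative (sketch, not wired in): with
+            # reduction='none' the Python loop above could be replaced by
+            #   token_losses = CrossEntropyLoss(ignore_index=-100, reduction='none')(
+            #       logits.view(-1, logits.size(-1)), labels.view(-1)).view(labels.shape)
+            #   mask = (labels != -100).float()
+            #   per_sample = (token_losses * mask).sum(1) / mask.sum(1).clamp(min=1)
+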
+            # --- Sort by loss value, descending ---
+            sample_losses.sort(key=lambda x: x[0], reverse=True)
+
+            # --- Print the highest-loss samples ---
+            for individual_loss_value, index in sample_losses:
+                # Only print samples above some loss threshold, or the top N
+                # if individual_loss_value > 0.1:  # optional filter
+                print(f"  - Dataset sample index: {batch['index'][index]}")
+                print(f"    Individual avg loss: {individual_loss_value:.4f}")
+                print(f"    Original text: {original_texts[index]}")
+                # print(f"    Original BIO : {' '.join(original_bios[index][:20])}...")  # print part of the BIO labels
+                # Optional: predict and print a comparison to help analyze errors
+                with torch.no_grad():
+                    pred_label_ids = torch.argmax(logits[index], dim=-1)
+
+                # parse_entity returns entity strings and their categories
+                pred_entities, _ = parse_entity(original_texts[index], pred_label_ids)
+                true_entities, _ = parse_entity(original_texts[index], labels[index])
+                print(f"    Predicted entities: {' '.join(pred_entities)}")
+                print(f"    True entities     : {' '.join(true_entities)}")
+
     # Save the trained model
     # model.save_pretrained("building_ner_model")
     # tokenizer.save_pretrained("building_ner_model")
     # model.save_pretrained("building_ner_model_bert_wwm")
     # tokenizer.save_pretrained("building_ner_model_bert_wwm")
-    model.save_pretrained(f"{model_name}-building")
-    tokenizer.save_pretrained(f"{model_name}-building")
+    model.save_pretrained(env.resolve_output(f"{model_name}-building"))
+    # save the tokenizer to the same resolved directory so both load together
+    tokenizer.save_pretrained(env.resolve_output(f"{model_name}-building"))
 
 
-def test(model_name):
+def predict_test(model_name):
     # Load the trained model
     # model = AutoModelForTokenClassification.from_pretrained("building_ner_model")
     # tokenizer = AutoTokenizer.from_pretrained("building_ner_model")
@@ -137,14 +207,14 @@ def test(model_name):
         print("\n")
         return predictions
 
-    df = pl.read_csv(str(Path(desc_file_name).expanduser()), columns=columns, encoding="utf-8")
+    df = pl.read_csv(str(env.resolve_data(sample_file_name)), columns=columns, encoding="utf-8")
 
     for description in df['摘要']:  # '摘要' = the "abstract" column
-        description = description.replace(r'\r', ' ').replace(r'\n', ' ').replace(r'\t', '').replace(' ', '')
+        description = clean_raw_text(description)
         predict_distilbert(description)
 
 
 if __name__ == '__main__':
-    train('hfl/chinese-roberta-wwm-ext-large-building')
+    # train('hfl/chinese-roberta-wwm-ext-large-building')
     # train('hfl/chinese-roberta-wwm-ext-large')
-    # test('hfl/chinese-roberta-wwm-ext-large-building')
+    # the trained model was saved as f"{model_name}-building", hence the doubled suffix
+    predict_test('hfl/chinese-roberta-wwm-ext-large-building-building')
     # test()