transformers 分类器训练

环境说明:

python 3.10

ubuntu 22.04

pytorch 2.3.0

transformers 4.40.2

问题记录：transformers 默认使用多 GPU 训练，如果在多 GPU 机器上运行，会出现如下问题：

RuntimeError: NCCL Error 2: unhandled system error (run with NCCL_DEBUG=INFO for details)

解决办法：指定程序使用的 GPU（建议放在脚本最开头，在 CUDA 初始化之前设置）：

import os

# Expose only GPU index 2 to this process so training does not run in
# multi-GPU mode; this should execute before CUDA is initialized.
os.environ['CUDA_VISIBLE_DEVICES'] = '2'

step1 导入包

from transformers import AutoModel,AutoModelForSequenceClassification,BertForSequenceClassification
from datasets import load_dataset,Dataset
import pandas as pd

step2 读取数据

# Load the labelled emotion data from the Excel sheet.
pd_data = pd.read_excel("./data/emotion_tag_hjy_0506.xls")

# Emotion tag -> integer class id.
tag_to_label = {
    "悲伤": 0,
    "愤怒": 1,
    "惊讶": 2,
    "恐惧": 3,
    "快乐": 4,
    "其他": 5,
    "厌恶": 6,
}

# Class id -> emotion tag, derived from the table above so the two mappings
# can never drift apart.
label_to_tag = {label: tag for tag, label in tag_to_label.items()}

# Direct dict lookup: an unknown tag raises KeyError instead of silently
# producing a bad label.
pd_data['label'] = pd_data["tag"].map(lambda x: tag_to_label[x])
# `.str.strip()` leaves missing cells as NaN, whereas calling `.strip()` on
# each value raises AttributeError for a NaN (float) cell.
pd_data['content_cn'] = pd_data["content_cn"].str.strip()

step3 数据转换Dataset

# Convert the DataFrame into a Dataset and shuffle it deterministically.
dataset = Dataset.from_pandas(pd_data)
dataset = dataset.shuffle(seed=42)
# Drop rows with missing text or label in a single pass.
dataset = dataset.filter(
    lambda x: x["content_cn"] is not None and x["label"] is not None
)
# Hold out 10% as the test split; seeded so the split is reproducible across
# runs, like the shuffle above.
dataset = dataset.train_test_split(test_size=0.1, seed=42)

step4 加载模型和tokenizer

from transformers import AutoTokenizer

# Checkpoint to fine-tune. Alternative checkpoint tried by the author:
# "google-bert/bert-base-chinese".
mode_name = "hfl/chinese-macbert-large"

# Sequence-classification model with one output per emotion label; both
# label mappings are passed in. `problem_type` is left unset.
model = AutoModelForSequenceClassification.from_pretrained(
    mode_name,
    num_labels=len(label_to_tag),
    id2label=label_to_tag,
    label2id=tag_to_label,
)
tokenizer = AutoTokenizer.from_pretrained(mode_name)

Step5 tokenizer 处理数据

def data_procce_function(example):
    """Tokenize ``example['content_cn']`` and attach the labels.

    The text is tokenized with truncation at 512 tokens, and the value of
    ``example['label']`` is copied into the ``labels`` field of the
    tokenizer output, which is then returned.
    """
    encoded = tokenizer(example['content_cn'], truncation=True, max_length=512)
    encoded['labels'] = example['label']
    return encoded


# Tokenize every split in batches and drop the original columns.
token_dataset = dataset.map(
    function=data_procce_function,
    batched=True,
    remove_columns=dataset["train"].column_names,
)
# Notebook-style display of the tokenized dataset.
token_dataset
# token_dataset["train"][0]
# Decode the first training example back to text as a sanity check.
tokenizer.decode(token_dataset["train"][0]['input_ids'])

Step6 创建评估函数

不同任务（见任务类型列表）使用的评估函数并不相同。在 Hugging Face 页面的右下角有一个 “Metrics for Text Classification” 列表可供参考。

import evaluate,numpy 

# Accuracy is used here; for a multi-label task use the F1 metric instead.
# Original GitHub example: https://github.com/huggingface/transformers/tree/main/examples/pytorch/text-classification
acc_metric = evaluate.load("accuracy")
# f1_mectric = evaluate.load("f1")
def eval_metric(eval_predict):
    """Compute accuracy for a ``(predictions, labels)`` pair.

    The predicted class is the argmax of ``predictions`` along axis 1. When
    the metric result holds more than one value, their mean is added under
    the key ``combined_score``.
    """
    outputs, references = eval_predict
    predicted_ids = numpy.argmax(outputs, axis=1)
    scores = acc_metric.compute(predictions=predicted_ids, references=references)
    if len(scores) <= 1:
        return scores
    scores["combined_score"] = numpy.mean(list(scores.values())).item()
    return scores

Step7 创建训练参数

from transformers import TrainingArguments

# Training hyper-parameters.
# Reference script: https://github.com/huggingface/transformers/blob/main/examples/pytorch/text-classification/run_classification.py
train_args = TrainingArguments(
    output_dir="./output",
    # --- optimisation ---
    num_train_epochs=5,                # number of training epochs
    learning_rate=2e-5,                # learning rate
    weight_decay=0.01,                 # weight decay
    optim="adafactor",                 # Adafactor optimizer
    per_device_train_batch_size=4,     # batch size during training
    gradient_accumulation_steps=32,    # gradient accumulation
    # --- evaluation ---
    per_device_eval_batch_size=64,     # batch size during evaluation
    evaluation_strategy="steps",       # evaluation strategy
    load_best_model_at_end=True,       # load the best model when training ends
    # Alternatives left disabled by the author:
    # evaluation_strategy="epoch",
    # save_strategy="epoch",
    # metric_for_best_model="f1",
    # --- logging ---
    logging_strategy='steps',
    logging_steps=50,
    logging_dir='./logs',
)
# train_args

#模型训练
from transformers import Trainer,DataCollatorWithPadding

# Collator built from the tokenizer; the Trainer uses it to assemble batches.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Wire the model, arguments, tokenized splits, collator and metric together.
trainer = Trainer(
    model=model,
    args=train_args,
    data_collator=data_collator,
    compute_metrics=eval_metric,
    train_dataset=token_dataset['train'],
    eval_dataset=token_dataset['test'],
)

# Run training and keep the returned result for later inspection.
train_result = trainer.train()

Step8 模型评估

# Log the metrics collected during training under the "train" prefix.
metrics = train_result.metrics
trainer.log_metrics("train", metrics)

Step 9 模型预测

from transformers import pipeline
# Alternative sample inputs (uncomment one to try it):
# input_text = '4月10日｜阿里巴巴(9988.HK)今日表现强势，盘中最高涨5.82%报74.6港元，股价创3月14日以来新高。消息面上，阿里巴巴创始人马云于阿里内网发帖，高度肯定蔡崇信和吴泳铭组成的新管理层的变革勇气，称阿里巴巴已重回健康成长轨道，并支持继续改革。这是退休后的马云五年来首次长篇幅分享对公司改革创新及未来前景的思考。'
# input_text = '这位辛迪·坎宁安明星说，当被告知她在28年后被解雇时，她“心碎”了......阅读整个故事'
# input_text = '阿尔贝松杂货店似乎发现自己成为了更多抖音诽谤的接收者，但这一次，该应用程序的一些人认为DoorDash可能与一名用户的碎牛肉剧有关。Shantaq(@heyshantaqtv)上传了一段病毒视频，呼吁阿尔贝松和...阅读整个故事'
# Named `input_text` rather than `input` so the builtin `input()` is not shadowed.
input_text = '洪水桥房协地盘四级火，起火逾18小时仍然未救熄。　　消防早上继续灌救，不断向洪雅路地盘火场射水，地盘仍然冒起浓烟，出动无人机升空视察火场情况。　　两个天秤一度被指有倒塌风险，屋宇署评估后沒有即时危险。　　有附近居民指有浓烟涌入家中，需关窗和戴上口罩。洪福邨居民何先生：「烟味很臭，我住在2楼也嗅到烟味，很像胶味、刺喉。今天好一些、昨天（星期二）厉害。昨天我不敢上去。」洪福邨居民黄小姐：「嗅到、仍很大烟味。担心味道、始终它烧了很久，烟也真的大。（家中有甚么应对措施？）把窗全都关掉、开冷气'

pipe = pipeline("text-classification", model=model, tokenizer=tokenizer, device=0)
# Truncate to the same 512-token limit used when tokenizing the training data,
# so longer texts are cut the same way at inference time.
pipe(input_text, truncation=True, max_length=512)