c语言sscanf函数的用法是什么
261
2022-08-26
pytorch transformers从头开始实现情感分析模型
最近transformers用得比较多。用transformers加载预训练模型然后做预测的教程挺多,但是讲如何加载自己的数据进行训练的挺少,我这里分享一下我的实现:
数据准备
# Data: the Kaggle "Google Play Store Apps" user-reviews sentiment dataset
# (see reference [1] at the bottom of the article).
# NOTE(review): the dataset URL was lost during extraction — restore it.
from transformers import BertModel, BertTokenizer, AdamW, get_linear_schedule_with_warmup
import torch
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report
from collections import defaultdict
from textwrap import wrap
from torch import nn, optim
from torch.utils.data import Dataset, DataLoader
import torch.nn.functional as F

# Fix random seeds so runs are reproducible.
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)
torch.manual_seed(RANDOM_SEED)

# Use the first GPU when available, otherwise fall back to CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
加载数据,预处理
# Load the Google Play user reviews and drop rows with missing values.
df = pd.read_csv("archive/googleplaystore_user_reviews.csv")
df = df.dropna()


def to_sentiment(rating):
    """Map the textual Sentiment label to an integer class id (0/1/2)."""
    if rating == 'Positive':
        return 2
    if rating == 'Neutral':
        return 1
    return 0


df['sentiment'] = df.Sentiment.apply(to_sentiment)
class_names = ["Negative", "Neutral", "Positive"]

# 90% train; the remaining 10% is split evenly into validation and test.
df_train, df_test = train_test_split(df, test_size=0.1, random_state=RANDOM_SEED)
df_val, df_test = train_test_split(df_test, test_size=0.5, random_state=RANDOM_SEED)
创建dataset和dataloader
class GPReviewDataset(Dataset):
    """PyTorch Dataset wrapping review texts and integer sentiment targets."""

    def __init__(self, reviews, targets, tokenizer, max_len):
        self.reviews = reviews
        self.targets = targets
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.reviews)

    def __getitem__(self, item):
        review = str(self.reviews[item])
        target = self.targets[item]
        # Tokenize to a fixed length: pad short reviews and truncate long
        # ones so every sample is exactly max_len tokens.
        # Fix: the original `pad_to_max_length=True` is deprecated (removed
        # in transformers 4.x) and it never truncated, so reviews longer
        # than max_len produced over-length tensors.
        encoding = self.tokenizer.encode_plus(
            review,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )
        return {
            'review_text': review,
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'targets': torch.tensor(target, dtype=torch.long),
        }


def create_data_loader(df, tokenizer, max_len, batch_size):
    """Build a DataLoader over the Translated_Review / sentiment columns."""
    ds = GPReviewDataset(
        reviews=df.Translated_Review.to_numpy(),
        targets=df.sentiment.to_numpy(),
        tokenizer=tokenizer,
        max_len=max_len,
    )
    # NOTE(review): no shuffle=True, so training batches keep file order —
    # consider shuffling the training loader; confirm this was intentional.
    return DataLoader(ds, batch_size=batch_size, num_workers=4)


BATCH_SIZE = 16
MAX_LEN = 160
PRE_TRAINED_MODEL_NAME = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(PRE_TRAINED_MODEL_NAME)

train_data_loader = create_data_loader(df_train, tokenizer, MAX_LEN, BATCH_SIZE)
val_data_loader = create_data_loader(df_val, tokenizer, MAX_LEN, BATCH_SIZE)
test_data_loader = create_data_loader(df_test, tokenizer, MAX_LEN, BATCH_SIZE)
创建基于BERT的情感分析模型
class SentimentClassifier(nn.Module):
    """BERT encoder + dropout + linear head for n_classes-way sentiment."""

    def __init__(self, n_classes):
        super(SentimentClassifier, self).__init__()
        self.bert = BertModel.from_pretrained(PRE_TRAINED_MODEL_NAME)
        self.drop = nn.Dropout(p=0.3)
        self.out = nn.Linear(self.bert.config.hidden_size, n_classes)

    def forward(self, input_ids, attention_mask):
        # Fix: transformers >= 4.x returns a ModelOutput object by default,
        # so the original tuple unpacking `_, pooled_output = self.bert(...)`
        # raises. Request the legacy tuple explicitly.
        _, pooled_output = self.bert(
            input_ids=input_ids,
            attention_mask=attention_mask,
            return_dict=False,
        )
        output = self.drop(pooled_output)
        # Raw logits — pair with nn.CrossEntropyLoss (no softmax here).
        return self.out(output)
训练和验证
# Build the classifier and place it on the target device.
model = SentimentClassifier(len(class_names))
model = model.to(device)

EPOCHS = 10

# AdamW (transformers variant) with a linear decay schedule and no warmup,
# spanning every optimizer step of the full training run.
optimizer = AdamW(model.parameters(), lr=2e-5, correct_bias=False)
total_steps = len(train_data_loader) * EPOCHS
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=total_steps,
)
loss_fn = nn.CrossEntropyLoss().to(device)
训练函数
def train_epoch(model, data_loader, loss_fn, optimizer, device, scheduler, n_examples):
    """Run one training epoch and return (accuracy, mean batch loss)."""
    model = model.train()
    batch_losses = []
    n_correct = 0
    for batch in data_loader:
        ids = batch["input_ids"].to(device)
        mask = batch["attention_mask"].to(device)
        labels = batch["targets"].to(device)

        logits = model(input_ids=ids, attention_mask=mask)
        predictions = logits.argmax(dim=1)
        loss = loss_fn(logits, labels)

        n_correct += torch.sum(predictions == labels)
        batch_losses.append(loss.item())

        loss.backward()
        # Clip gradient norm to stabilize BERT fine-tuning.
        nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()
        scheduler.step()
        optimizer.zero_grad()
    return n_correct.double() / n_examples, np.mean(batch_losses)
验证函数:
def eval_model(model, data_loader, loss_fn, device, n_examples):
    """Evaluate without gradient tracking; return (accuracy, mean batch loss)."""
    model = model.eval()
    batch_losses = []
    n_correct = 0
    with torch.no_grad():
        for batch in data_loader:
            ids = batch["input_ids"].to(device)
            mask = batch["attention_mask"].to(device)
            labels = batch["targets"].to(device)

            logits = model(input_ids=ids, attention_mask=mask)
            predictions = logits.argmax(dim=1)
            loss = loss_fn(logits, labels)

            n_correct += torch.sum(predictions == labels)
            batch_losses.append(loss.item())
    return n_correct.double() / n_examples, np.mean(batch_losses)
调用他们来进行训练:
# Main training loop: track metrics per epoch and checkpoint the best model
# by validation accuracy.
history = defaultdict(list)
best_accuracy = 0

for epoch in range(EPOCHS):
    print(f'Epoch {epoch + 1}/{EPOCHS}')
    print('-' * 10)

    train_acc, train_loss = train_epoch(
        model,
        train_data_loader,
        loss_fn,
        optimizer,
        device,
        scheduler,
        len(df_train),
    )
    print(f'Train loss {train_loss} accuracy {train_acc}')

    val_acc, val_loss = eval_model(
        model,
        val_data_loader,
        loss_fn,
        device,
        len(df_val),
    )
    print(f'Val loss {val_loss} accuracy {val_acc}')
    print()

    history['train_acc'].append(train_acc)
    history['train_loss'].append(train_loss)
    history['val_acc'].append(val_acc)
    history['val_loss'].append(val_loss)

    # Save weights whenever validation accuracy improves.
    if val_acc > best_accuracy:
        torch.save(model.state_dict(), 'best_model_state.bin')
        best_accuracy = val_acc
到这里就结束了。预测时只需要加载模型,再把输入转换成id即可,但训练过程还是比预测过程麻烦很多。我这里只提供了一种基础方法,还有很多需要完善的地方,欢迎改进。
参考文献
[1].Google Play Store Apps. [2].Sentiment Analysis with BERT and Transformers by Hugging Face using PyTorch and Python. [3].Sentiment Analysis with BERT. https://github.com/curiousily/Getting-Things-Done-with-Pytorch/blob/master/08.sentiment-analysis-with-bert.ipynb
版权声明:本文内容由网络用户投稿,版权归原作者所有,本站不拥有其著作权,亦不承担相应法律责任。如果您发现本站中有涉嫌抄袭或描述失实的内容,请联系我们jiasou666@gmail.com 处理,核实后本网站将在24小时内删除侵权内容。
发表评论
暂时没有评论,来抢沙发吧~