pytorch transformers从头开始实现情感分析模型

网友投稿 261 2022-08-26

pytorch transformers从头开始实现情感分析模型

最近transformers用得比较多。用transformers加载预训练模型然后做预测的教程挺多,但讲解如何加载自己的数据进行训练的挺少,我这里分享一下我的实现:

数据准备

# Data comes from the Kaggle sentiment-analysis dataset
# (the article's original download link was lost during extraction).

from collections import defaultdict
from textwrap import wrap

import numpy as np
import pandas as pd
import torch
import torch.nn.functional as F
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.model_selection import train_test_split
from torch import nn, optim
from torch.utils.data import Dataset, DataLoader
from transformers import (
    AdamW,
    BertModel,
    BertTokenizer,
    get_linear_schedule_with_warmup,
)

# Fix the RNG seeds so that data splits and weight init are reproducible.
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)
torch.manual_seed(RANDOM_SEED)

# Use the first GPU when available, otherwise fall back to the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

加载数据,预处理

# Load the Google Play user reviews and drop rows with missing values.
df = pd.read_csv("archive/googleplaystore_user_reviews.csv")
df = df.dropna()


def to_sentiment(rating):
    """Map the textual Sentiment label to an integer class id (0/1/2)."""
    # Anything other than 'Positive'/'Neutral' (i.e. 'Negative') maps to 0.
    return {'Positive': 2, 'Neutral': 1}.get(rating, 0)


df['sentiment'] = df.Sentiment.apply(to_sentiment)
class_names = ["Negative", "Neutral", "Positive"]

# 90% train; the remaining 10% is split evenly into validation and test.
df_train, df_test = train_test_split(df, test_size=0.1, random_state=RANDOM_SEED)
df_val, df_test = train_test_split(df_test, test_size=0.5, random_state=RANDOM_SEED)

创建dataset和dataloader

class GPReviewDataset(Dataset):
    """Torch Dataset wrapping raw review strings and integer sentiment targets.

    Each item is tokenized on the fly to exactly ``max_len`` tokens so that
    the default collate function can stack the tensors into batches.
    """

    def __init__(self, reviews, targets, tokenizer, max_len):
        self.reviews = reviews      # array-like of review strings
        self.targets = targets      # array-like of integer class ids
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.reviews)

    def __getitem__(self, item):
        review = str(self.reviews[item])
        target = self.targets[item]
        encoding = self.tokenizer.encode_plus(
            review,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            # FIX: `pad_to_max_length` is deprecated (removed in newer
            # transformers) and did not truncate over-long reviews, which
            # broke fixed-size batching. `padding` + `truncation` guarantee
            # every sequence is exactly `max_len` tokens.
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )
        return {
            'review_text': review,
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'targets': torch.tensor(target, dtype=torch.long)
        }


def create_data_loader(df, tokenizer, max_len, batch_size, shuffle=False):
    """Build a DataLoader over a review DataFrame.

    `shuffle` defaults to False so validation/test order stays deterministic;
    pass True for the training split so batches differ between epochs.
    """
    ds = GPReviewDataset(
        reviews=df.Translated_Review.to_numpy(),
        targets=df.sentiment.to_numpy(),
        tokenizer=tokenizer,
        max_len=max_len
    )
    return DataLoader(
        ds,
        batch_size=batch_size,
        num_workers=4,
        shuffle=shuffle
    )


BATCH_SIZE = 16
MAX_LEN = 160
PRE_TRAINED_MODEL_NAME = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(PRE_TRAINED_MODEL_NAME)
# FIX: shuffle only the training loader (the original never shuffled, so every
# epoch saw identical batch order); evaluation loaders stay deterministic.
train_data_loader = create_data_loader(df_train, tokenizer, MAX_LEN, BATCH_SIZE, shuffle=True)
val_data_loader = create_data_loader(df_val, tokenizer, MAX_LEN, BATCH_SIZE)
test_data_loader = create_data_loader(df_test, tokenizer, MAX_LEN, BATCH_SIZE)

创建基于BERT的情感分析模型

class SentimentClassifier(nn.Module):
    """BERT encoder + dropout + linear head for 3-way sentiment classification."""

    def __init__(self, n_classes):
        super(SentimentClassifier, self).__init__()
        self.bert = BertModel.from_pretrained(PRE_TRAINED_MODEL_NAME)
        self.drop = nn.Dropout(p=0.3)
        # Project the pooled [CLS] representation down to one logit per class.
        self.out = nn.Linear(self.bert.config.hidden_size, n_classes)

    def forward(self, input_ids, attention_mask):
        """Return raw (unnormalized) class logits of shape (batch, n_classes)."""
        outputs = self.bert(
            input_ids=input_ids,
            attention_mask=attention_mask
        )
        # FIX: transformers >= 4 returns a ModelOutput by default, so the
        # original tuple unpacking `_, pooled_output = self.bert(...)` raises
        # "too many values to unpack". Integer indexing works on both the
        # legacy tuple and ModelOutput; position 1 is the pooled output.
        pooled_output = outputs[1]
        output = self.drop(pooled_output)
        return self.out(output)

训练和验证

# Instantiate the classifier (one output per class) and move it to the device.
model = SentimentClassifier(len(class_names))
model = model.to(device)

EPOCHS = 10  # number of full passes over the training data

# This AdamW is the transformers implementation (it accepts `correct_bias`).
# NOTE(review): deprecated in newer transformers in favour of
# torch.optim.AdamW — confirm the installed version before upgrading.
optimizer = AdamW(model.parameters(), lr=2e-5, correct_bias=False)

# The scheduler is stepped once per batch, so the total step count is
# batches-per-epoch * epochs: linear decay over the whole run, no warmup.
total_steps = len(train_data_loader) * EPOCHS
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=total_steps
)

# Cross-entropy over the raw logits produced by SentimentClassifier.forward.
loss_fn = nn.CrossEntropyLoss().to(device)

训练函数

def train_epoch( model, data_loader, loss_fn, optimizer, device, scheduler, n_examples): model = model.train() losses = [] correct_predictions = 0 for d in data_loader: input_ids = d["input_ids"].to(device) attention_mask = d["attention_mask"].to(device) targets = d["targets"].to(device) outputs = model( input_ids=input_ids, attention_mask=attention_mask ) _, preds = torch.max(outputs, dim=1) loss = loss_fn(outputs, targets) correct_predictions += torch.sum(preds == targets) losses.append(loss.item()) loss.backward() nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0) optimizer.step() scheduler.step() optimizer.zero_grad() return correct_predictions.double() / n_examples, np.mean(losses)

验证函数:

def eval_model(model, data_loader, loss_fn, device, n_examples): model = model.eval() losses = [] correct_predictions = 0 with torch.no_grad(): for d in data_loader: input_ids = d["input_ids"].to(device) attention_mask = d["attention_mask"].to(device) targets = d["targets"].to(device) outputs = model( input_ids=input_ids, attention_mask=attention_mask ) _, preds = torch.max(outputs, dim=1) loss = loss_fn(outputs, targets) correct_predictions += torch.sum(preds == targets) losses.append(loss.item()) return correct_predictions.double() / n_examples, np.mean(losses)

调用他们来进行训练:

# Track metrics per epoch and checkpoint the weights of the best model
# (as measured by validation accuracy) to 'best_model_state.bin'.
history = defaultdict(list)
best_accuracy = 0

for epoch in range(EPOCHS):
    print(f'Epoch {epoch + 1}/{EPOCHS}')
    print('-' * 10)

    # One full pass over the training data; the optimizer and LR scheduler
    # are stepped once per batch inside train_epoch.
    train_acc, train_loss = train_epoch(
        model,
        train_data_loader,
        loss_fn,
        optimizer,
        device,
        scheduler,
        len(df_train)
    )
    print(f'Train loss {train_loss} accuracy {train_acc}')

    # Validation pass with gradients disabled.
    val_acc, val_loss = eval_model(
        model,
        val_data_loader,
        loss_fn,
        device,
        len(df_val)
    )
    print(f'Val loss {val_loss} accuracy {val_acc}')
    print()

    history['train_acc'].append(train_acc)
    history['train_loss'].append(train_loss)
    history['val_acc'].append(val_acc)
    history['val_loss'].append(val_loss)

    # Save a checkpoint only when validation accuracy improves.
    if val_acc > best_accuracy:
        torch.save(model.state_dict(), 'best_model_state.bin')
        best_accuracy = val_acc

结束,虽然预测只需要加载模型,然后把输入变成id就行了。但是训练过程还是比预测过程麻烦很多,我这里只提供了一种基础方法,还有很多需要完善的地方,欢迎改进。

参考文献

[1].Google Play Store Apps. [2].Sentiment Analysis with BERT and Transformers by Hugging Face using PyTorch and Python. [3].Sentiment Analysis with BERT. https://github.com/curiousily/Getting-Things-Done-with-Pytorch/blob/master/08.sentiment-analysis-with-bert.ipynb

版权声明:本文内容由网络用户投稿,版权归原作者所有,本站不拥有其著作权,亦不承担相应法律责任。如果您发现本站中有涉嫌抄袭或描述失实的内容,请联系我们jiasou666@gmail.com 处理,核实后本网站将在24小时内删除侵权内容。

上一篇:python K-Means算法从头实现
下一篇:“吃鸡”最失败的3个改动!看似是营销鬼才,实则自断业绩来源!
相关文章

 发表评论

暂时没有评论,来抢沙发吧~