AIVIVN Product Review Sentiment Analysis [PyTorch Lightning Sample]

Tutorials / Implementations
NLP
Training a sentiment classifier (TextCNN) on the AIVIVN product review dataset using PyTorch Lightning.
Published

August 17, 2021

from google.colab import drive
drive.mount('/content/drive')
Mounted at /content/drive

Install required packages

%%capture
!pip install pytorch-lightning
!pip install torchmetrics
!pip install pyvi 
!pip install torch-summary

Import required packages

import re
import copy
from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple, Union
from os.path import abspath
import torchmetrics
import pandas as pd

Have a closer look at the dataset

The data contains user reviews in two categories: “positive” and “negative”. There are 27,068 sentences in total:

  • Train: 16,087 sentences
  • Test: 10,981 sentences (public: 5,454 sentences, private: 5,527 sentences)
  • Labels: 0 (positive), 1 (negative)

You can download the dataset here.
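For reference, the layout of a .crash file is assumed here to be the one implied by the read_file function below (this is inferred from the parsing code, not from an official format description): an id line, one or more lines of raw review text, and, for the train split only, a final line holding the label. Roughly:

train_000000
"<review text, possibly spanning several lines>"
0
train_000001
"<another review text>"
1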

train_path = "/content/drive/MyDrive/SLSOPS/dataset/Aivivn_vietnamese_dataset/train.crash"
test_path = "/content/drive/MyDrive/SLSOPS/dataset/Aivivn_vietnamese_dataset/test.crash"
def split_array(arr, condition):
    if len(arr) == 0:
        return []
    result = []
    accumulated = [arr[0]]
    for ele in arr[1:]:
        if condition(ele):
            result.append(copy.deepcopy(accumulated))
            accumulated = [copy.deepcopy(ele)]
        else:
            accumulated.append(copy.deepcopy(ele))
    result.append(copy.deepcopy(accumulated))
    return result


def read_file(file_path, is_train=True):
    file_path = abspath(file_path)
    data_lines = list(
        filter(lambda x: x != '', open(file_path).read().split('\n')))
    pattern = ('train' if is_train else 'test') + '_[0-9]{5}'
    datas = split_array(data_lines, lambda x: bool(re.match(pattern, x)))
    if is_train:
        result_array = list(map(
            lambda x: [x[0], ' '.join(x[1:-1]), int(x[-1])], datas))
    else:
        result_array = list(map(lambda x: [x[0], ' '.join(x[1:])], datas))
    columns = ['name', 'text', 'label'] if is_train else ['name', 'text']
    return pd.DataFrame(result_array, columns=columns)
train_df = read_file(train_path)
test_df = read_file(test_path, is_train=False)
# Having a look at the train set. Labels: 0: Positive, 1: Negative
train_df.head()
name text label
0 train_000000 "Dung dc sp tot cam on shop Đóng gói sản phẩm... 0
1 train_000001 " Chất lượng sản phẩm tuyệt vời . Son mịn nhưn... 0
2 train_000002 " Chất lượng sản phẩm tuyệt vời nhưng k có hộp... 0
3 train_000003 ":(( Mình hơi thất vọng 1 chút vì mình đã kỳ v... 1
4 train_000004 "Lần trước mình mua áo gió màu hồng rất ok mà ... 1
# Having a look at the test set
test_df.head()
name text
0 test_000000 "Chưa dùng thử nên chưa biết"
1 test_000001 " Không đáng tiềnVì ngay đợt sale nên mới mua ...
2 test_000002 "Cám ơn shop. Đóng gói sản phẩm rất đẹp và chắ...
3 test_000003 "Vải đẹp.phom oki luôn.quá ưng"
4 test_000004 "Chuẩn hàng đóng gói đẹp"

Define dataset and dataloader classes

from typing import List, Tuple
import torch
import torchtext
from collections import Counter, OrderedDict
from torch.nn.utils.rnn import pad_sequence
from torchtext.vocab import Vectors, Vocab

class Tokenizer():
    def __init__(self, tokenizer: Any):
        self.counter = Counter(['<pad>', '<unk>'])
        self.tokenizer = tokenizer
        self.vocab = None
        self.update_vocab()
    
    def update_vocab(self):
        # sorted_by_freq_tuples = sorted(self.counter.items()[2:], key=lambda x: x[1], reverse=True)
        ordered_dict = OrderedDict(self.counter.items())
        self.vocab = torchtext.vocab.vocab(ordered_dict, min_freq=1)

    def fit_on_texts(self, texts: List[str]):
        """
        Updates internal vocabulary based on a list of texts.
        """
        for text in texts:
            tokens = [t.text for t in self.tokenizer(text)] 
            self.counter.update(tokens)
        self.update_vocab()
    
    def texts_to_sequences(self, texts: List[str], tensor: bool = True) -> List[Union[List[int], torch.Tensor]]:
        word2idx = self.vocab.get_stoi()
        sequences = []
        for text in texts:
            seq = [word2idx.get(token.text, word2idx['<unk>']) for token in self.tokenizer(text)]
            if tensor:
                seq = torch.tensor(seq)
            sequences.append(seq)
        return sequences

def _load_data_from(data_path: Union[str, Path]):
    df = read_file(data_path)
    sents = list(df['text'].str.strip().str.lower())
    sentiments = list(df['label'])
    return sents, sentiments

def _save_to_csv(file_path: Union[str, Path], data):
    sents, sentiments = data
    df = pd.DataFrame({
        "text": sents,
        "label": sentiments,
    })
    df.to_csv(file_path, index=False)
    return file_path

def _preprocess_data(data: Tuple[List[str], List[int]], tokenizer: Tokenizer):
    sentences, sentiments = data
    sequences = tokenizer.texts_to_sequences(sentences)
    sentiment_tensor = torch.tensor(sentiments)
    # pad sequences
    sequences = pad_sequence(sequences, batch_first=True)
    assert len(sequences) == len(sentiments)
    all_data = []
    for i in range(len(sentiments)):
        sample = {
            'sequence': sequences[i],
            'sentiment': sentiment_tensor[i]
        }
        all_data.append(sample)
    return all_data

def build_vocab(tokenizer, data):
    sentences = data[0]
    tokenizer.fit_on_texts(sentences)
from gensim.models import KeyedVectors
from gensim.test.utils import datapath
import numpy as np

def load_pretrained_word_embeddings(w2v_path: str):
    return KeyedVectors.load_word2vec_format(datapath(w2v_path), binary=False)

def create_embedding_matrix(w2v_model, vocab: Vocab, path: Union[str, Path]):
    if os.path.exists(path):
        print(f'loading embedding matrix from {path}')
        embedding_matrix = pickle.load(open(path, 'rb'))
    else:
        # Calculate vector for OOV token
        OOV_vec = torch.from_numpy(np.mean(w2v_model.vectors, axis=0))
        embedding_matrix = torch.zeros((len(vocab), w2v_model.vector_size), 
                                       dtype=torch.float)

        # words not found in the pretrained embeddings are assigned the mean (OOV) vector; <pad> stays all-zero
        for word, index in vocab.get_stoi().items():
            if word in w2v_model.vocab:
                embedding_matrix[index] = torch.from_numpy(w2v_model[word])
            else:
                if word == "<pad>":
                    continue
                embedding_matrix[index] = OOV_vec

        # save embedding matrix
        pickle.dump(embedding_matrix, open(path, 'wb'))
    return embedding_matrix

Now we create the DataModule class for the dataset using the PyTorch Lightning framework. You can read more here.

import torch
from torch.utils.data import DataLoader, Dataset, random_split
import pytorch_lightning as pl

class AIVIVNDataset(Dataset):
    def __init__(self, data):
        self.data = data
    def __len__(self):
        return len(self.data)
    def __getitem__(self, idx):
        return self.data[idx]

class AIVIVN(pl.LightningDataModule):
    
    def __init__(self, tokenizer, opts: Dict[str, Any]):
        super().__init__()
        self.tokenizer = tokenizer
        self.batch_size = opts['batch_size']
        self.num_workers = opts['num_workers']
        self.on_gpu = opts['on_gpu']
        self.train_ds = None
        self.val_ds = None

        self.mapping = {"negative": 1, "positive": 0}
        self.inverse_mapping = {v: k for k, v in self.mapping.items()}
    
    def prepare_data(self, *args, **kwargs) -> None:
        self.train_path = '/content/drive/MyDrive/SLSOPS/dataset/Aivivn_vietnamese_dataset/train.crash'

    def setup(self, stage: str = None) -> None:
        if stage == "fit" or stage is None:
            # Load data from files
            train_data = _load_data_from(self.train_path)
            preprocessed_data = _preprocess_data(train_data, self.tokenizer)
            dataset = AIVIVNDataset(preprocessed_data)
            lengths = [int(len(dataset)*0.85), len(dataset) - int(len(dataset)*0.85)]
            self.train_ds, self.val_ds = random_split(dataset, lengths)

    def train_dataloader(self):
        return DataLoader(
            self.train_ds,
            shuffle=True,
            batch_size=self.batch_size,
            num_workers=self.num_workers,
            pin_memory=self.on_gpu
        )
    
    def val_dataloader(self):
        return DataLoader(
            self.val_ds,
            shuffle=False,
            batch_size=self.batch_size,
            num_workers=self.num_workers,
            pin_memory=self.on_gpu,
        )
    
    def __repr__(self):
        basic = f"AIVIVN Product Review Dataset\nNum classes: {len(self.mapping)}\nMapping: {self.mapping}\n"
        if self.train_ds is None and self.val_ds is None:
            return basic
        batch = next(iter(self.train_dataloader()))
        sequences, sentiments = batch['sequence'], batch['sentiment']
        data = (
            f"Train/val sizes: {len(self.train_ds)}, {len(self.val_ds)}\n"
            f"Batch sequences stats: {(sequences.shape, sequences.dtype)}\n"
            f"Batch sentiments stats: {(sentiments.shape, sentiments.dtype)}\n"
        )
        return basic + data

Implementation (TextCNN)

import torch.nn as nn
import torch.nn.functional as F

class ConvPool(nn.Module):
    def __init__(self, in_channels, out_channels, conv_kernel_sz, pool_kernel_sz):
        super(ConvPool, self).__init__()
        self.conv = nn.Conv1d(in_channels=in_channels, out_channels=out_channels, kernel_size=conv_kernel_sz)
        self.pool = nn.MaxPool1d(pool_kernel_sz)

    def forward(self, x):
        out = self.conv(x)
        out = F.relu(out)
        out = self.pool(out)
        return out


class TextCNN(pl.LightningModule):
    def __init__(self, embeddings, num_classes=2, batch_first=True, lr=1e-3, dropout=0, l2reg=0.01):
        super().__init__()
        embedding_dim = embeddings.shape[1]
        self.embedding = nn.Embedding.from_pretrained(embeddings)
        kernel_sizes = [3,4,5]
        self.filters = nn.ModuleList([ConvPool(embedding_dim, 128, conv_kernel_sz=conv_kernel_size, pool_kernel_sz=5) for conv_kernel_size in kernel_sizes])
        self.conv_pool1 = ConvPool(128, 128, 5, 5)
        self.conv_pool2 = ConvPool(128, 128, 5, 30)
        self.flatten = nn.Flatten(start_dim=1)
        self.linear1 = nn.Linear(256, 128)
        self.linear2 = nn.Linear(128, num_classes)

        self.lr = lr
        self.l2reg = l2reg

        self.train_acc = torchmetrics.Accuracy()
        self.val_acc = torchmetrics.Accuracy()
        self.val_f1 = torchmetrics.F1(num_classes=2, average='macro')
        self.test_acc = torchmetrics.Accuracy()
        self.test_f1 = torchmetrics.F1(num_classes=2, average='macro')
    
    def configure_optimizers(self):
        # apply the l2reg hyper-parameter as weight decay (L2 regularization)
        optim = torch.optim.Adam(self.parameters(), lr=self.lr, weight_decay=self.l2reg)
        return optim

    def forward(self, input):
        sequences = input['sequence'] # BxS
        embeds = self.embedding(sequences).permute(0, 2, 1) # BxHxS (channels first for Conv1d)
        out_1 = self.filters[0](embeds)
        out_2 = self.filters[1](embeds)
        out_3 = self.filters[2](embeds)
        out = torch.cat((out_1, out_2, out_3), dim=2)
        out = self.conv_pool1(out)
        out = self.conv_pool2(out)
        out = self.flatten(out)
        out = self.linear1(out)
        out = F.relu(out)
        logit = self.linear2(out)
        return logit
    
    def training_step(self, batch, batch_idx):
        sentiments = batch['sentiment']
        logits = self.forward(batch)
        loss = F.cross_entropy(logits, sentiments)
        scores = F.softmax(logits, dim=-1)
        self.train_acc(scores, sentiments)
        self.log('train_loss', loss, on_step=True, on_epoch=True, prog_bar=True)
        self.log('train_acc', self.train_acc, on_step=False, on_epoch=True, prog_bar=True, logger=True)
        return loss
    
    def validation_step(self, batch, batch_idx):  # pylint: disable=unused-argument
        sentiments = batch['sentiment']
        logits = self.forward(batch)
        loss = F.cross_entropy(logits, sentiments)
        scores = F.softmax(logits, dim=-1)
        self.val_acc(scores, sentiments)
        self.val_f1(scores, sentiments)
        self.log('val_loss', loss, on_step=True, on_epoch=True, prog_bar=True)
        self.log('val_acc', self.val_acc, on_step=False, on_epoch=True, prog_bar=True, logger=True)
        self.log('val_f1', self.val_f1, on_step=False, on_epoch=True, prog_bar=True, logger=True)
    
    def test_step(self, batch, batch_idx):  # pylint: disable=unused-argument
        sentiments = batch['sentiment']
        logits = self.forward(batch)
        scores = F.softmax(logits, dim=-1)
        self.test_acc(scores, sentiments)
        self.test_f1(scores, sentiments)
        self.log('test_acc', self.test_acc, on_step=False, on_epoch=True, logger=True)
        self.log('test_f1', self.test_f1, on_step=False, on_epoch=True, logger=True)
    

nn.ModuleList does not have a forward() method because it does not define a network by itself: there is no connection between the nn.Module’s it stores. You can use it to hold nn.Module’s just like you use a Python list to hold other objects (integers, strings, etc.).

The advantage of nn.ModuleList over a plain Python list is that PyTorch is “aware” of the nn.Module’s stored inside it, which is not the case for an ordinary list. If you keep the layers in a Python list instead, the optimizer will raise an error saying the model has no parameters, because PyTorch does not see the parameters of layers stored that way. With nn.ModuleList, the parameters are registered and the error goes away.
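To illustrate (a minimal sketch, not part of the original notebook; the toy classes below are made up for the example), compare how many parameters PyTorch registers in each case:

import torch.nn as nn

class WithPlainList(nn.Module):
    def __init__(self):
        super().__init__()
        # plain Python list: PyTorch does not register these layers
        self.layers = [nn.Linear(10, 10) for _ in range(3)]

class WithModuleList(nn.Module):
    def __init__(self):
        super().__init__()
        # nn.ModuleList: each layer is registered as a submodule
        self.layers = nn.ModuleList([nn.Linear(10, 10) for _ in range(3)])

print(len(list(WithPlainList().parameters())))   # 0 -> the optimizer would see no parameters
print(len(list(WithModuleList().parameters())))  # 6 (weight + bias for each of the 3 layers)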

Training

# Load pretrained w2v model
w2v_path = "/content/drive/MyDrive/SLSOPS/pretrained_w2v/word2vec_vi_words_100dims.txt"
w2v_model = load_pretrained_word_embeddings(w2v_path)
# Load dataset
train_path = '/content/drive/MyDrive/SLSOPS/dataset/Aivivn_vietnamese_dataset/train.crash'
train_data = _load_data_from(train_path)
# Create Tokenizer
from spacy.lang.vi import Vietnamese
nlp = Vietnamese()
tokenizer = Tokenizer(nlp)

# Build vocabulary
build_vocab(tokenizer, [train_data[0]])
import os
import pickle

# Create embedding matrix from pretrained w2v
embedding_matrix = create_embedding_matrix(w2v_model, tokenizer.vocab, "embedding_matrix.dat")
options = {
    "on_gpu": True,
    "batch_size": 16,
    "num_workers": 2
}
# Create DataModule
datamodule = AIVIVN(tokenizer, options)
from pytorch_lightning.callbacks import ModelCheckpoint
checkpoint_callback = ModelCheckpoint(
    monitor='val_acc', # save the model with the best validation accuracy
    dirpath='checkpoints',
    mode='max',
)

# Set hyper-parameters
lr = 1e-3 
num_epochs = 20
l2reg = 1e-5
dropout = 0.0

trainer = pl.Trainer(gpus=1, max_epochs=num_epochs, callbacks=[checkpoint_callback], deterministic=True)
# trainer = pl.Trainer(fast_dev_run=True, gpus=1) #Debug 
# trainer = pl.Trainer(overfit_batches=0.1, max_epochs=num_epochs, gpus=1) #Debug
model = TextCNN(embedding_matrix, lr=lr, l2reg=l2reg, dropout=dropout)
trainer.fit(model, datamodule)
GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name       | Type       | Params
------------------------------------------
0 | embedding  | Embedding  | 1.2 M 
1 | filters    | ModuleList | 153 K 
2 | conv_pool1 | ConvPool   | 82.0 K
3 | conv_pool2 | ConvPool   | 82.0 K
4 | flatten    | Flatten    | 0     
5 | linear1    | Linear     | 32.9 K
6 | linear2    | Linear     | 258   
7 | train_acc  | Accuracy   | 0     
8 | val_acc    | Accuracy   | 0     
9 | val_f1     | F1         | 0     
------------------------------------------
351 K     Trainable params
1.2 M     Non-trainable params
1.6 M     Total params
6.283     Total estimated model params size (MB)

Save model and tokenizer for inference

# Load best model from training
new_model = TextCNN.load_from_checkpoint('/content/checkpoints/epoch=1-step=1709.ckpt', embeddings=embedding_matrix)
# Test the loaded model with the validation set to double check
trainer.test(new_model, datamodule.val_dataloader())
/usr/local/lib/python3.7/dist-packages/pytorch_lightning/core/datamodule.py:424: LightningDeprecationWarning: DataModule.setup has already been called, so it will not be called again. In v1.6 this behavior will change to always call DataModule.setup.
  f"DataModule.{name} has already been called, so it will not be called again. "
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
--------------------------------------------------------------------------------
DATALOADER:0 TEST RESULTS
{'test_acc': 0.8881524205207825, 'test_f1': 0.8867566585540771}
--------------------------------------------------------------------------------
/usr/local/lib/python3.7/dist-packages/pytorch_lightning/core/datamodule.py:424: LightningDeprecationWarning: DataModule.teardown has already been called, so it will not be called again. In v1.6 this behavior will change to always call DataModule.teardown.
  f"DataModule.{name} has already been called, so it will not be called again. "
[{'test_acc': 0.8881524205207825, 'test_f1': 0.8867566585540771}]
# Save tokenizer
import pickle
with open('tokenizer.pkl', 'wb') as outp:
    pickle.dump(tokenizer, outp, pickle.HIGHEST_PROTOCOL)
# Save entire model
torch.save(new_model, "model")

Inference

To run inference, we need three steps:

  1. Load the model and the tokenizer.
  2. Define a preprocessing function to preprocess the input before feeding it into the model.
  3. (Optional) Convert the predictions to labels.

import torch.nn.functional as F
inputs = [":(( Mình hơi thất vọng 1 chút vì mình đã kỳ vọng cuốn sách khá nhiều hi vọng nó sẽ nói về việc học tập của cách sinh viên trường Harvard ra sao những nỗ lực của họ như thế nào 4h sáng? tại sao họ lại phải thức dậy vào thời khắc đấy? sau đó là cả một câu chuyện ra sao. Cái mình thực sự cần ở đây là câu chuyện ẩn dấu trong đó để tự bản thân mỗi người cảm nhận và đi sâu vào lòng người hơn. Còn cuốn sách này chỉ đơn thuần là cuốn sách dạy kĩ năng mà hầu như sách nào cũng đã có. BUồn...", "Chất lượng sản phẩm tuyệt vời nhưng k có hộp k có dây giày đen k có tất"]
# preprocess input
def _preprocess_data_for_inference(sentences: List[str], tokenizer: Tokenizer):
    sequences = tokenizer.texts_to_sequences(sentences, tensor=True)
    # pad each sequence to the same fixed length the training data was padded to (557 tokens)
    sequences = torch.stack([F.pad(seq, (0, 557 - len(seq)), 'constant', 0) for seq in sequences])
    return {"sequence": sequences}
input_data = _preprocess_data_for_inference(inputs, tokenizer)
new_model.eval()
predictions = new_model(input_data)
torch.argmax(predictions, axis=-1)
tensor([1, 0])
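For step 3, the predicted class indices can be mapped back to readable labels with the inverse mapping defined in the DataModule; a minimal sketch:

# Map predicted class indices back to label names using the DataModule's inverse mapping
pred_idx = torch.argmax(predictions, axis=-1).tolist()
pred_labels = [datamodule.inverse_mapping[i] for i in pred_idx]
pred_labels  # -> ['negative', 'positive'] for the two inputs above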
train_df.iloc[3]['text']
'":(( Mình hơi thất vọng 1 chút vì mình đã kỳ vọng cuốn sách khá nhiều hi vọng nó sẽ nói về việc học tập của cách sinh viên trường Harvard ra sao những nỗ lực của họ như thế nào 4h sáng? tại sao họ lại phải thức dậy vào thời khắc đấy? sau đó là cả một câu chuyện ra sao. Cái mình thực sự cần ở đây là câu chuyện ẩn dấu trong đó để tự bản thân mỗi người cảm nhận và đi sâu vào lòng người hơn. Còn cuốn sách này chỉ đơn thuần là cuốn sách dạy kĩ năng mà hầu như sách nào cũng đã có. BUồn..."'

Debug

# Random check the pretrain word embeddings
A = embedding_matrix[tokenizer.vocab.get_stoi()['ăn_nằm']]
B = w2v_model["ăn_nằm"]
np.array_equal(A,B)

Lesson Learned

  • Make sure to check the train datamodule when debugging the model.
  • Don’t just copy code from a previous project. Read through it before using it to catch hard-to-spot bugs. In this post, the bug was in the way we load the pretrained word embeddings.