from google.colab import drive
drive.mount('/content/drive')
Mounted at /content/drive
August 17, 2021
The data contains users’ reviews in two categories: “positive” and “negative”. There are 27068 sentences in total.

* Train: 16087 sentences
* Test: 10981 sentences (public: 5454 sentences, private: 5527 sentences)
* Labels: 0 (positive), 1 (negative)
You can download the dataset here.
import copy
import re
from os.path import abspath

import pandas as pd


def split_array(arr, condition):
    """Split `arr` into groups, starting a new group at each element where `condition` is True."""
if len(arr) == 0:
return []
result = []
accumulated = [arr[0]]
for ele in arr[1:]:
if condition(ele):
result.append(copy.deepcopy(accumulated))
accumulated = [copy.deepcopy(ele)]
else:
accumulated.append(copy.deepcopy(ele))
result.append(copy.deepcopy(accumulated))
return result
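As a quick sanity check, here is what `split_array` does on a couple of made-up lines: it starts a new group every time `condition` fires, so the flat list of file lines is grouped into one record per review.

```python
import re

lines = [
    'train_000000', 'Dung dc sp tot cam on shop', '0',
    'train_000001', 'Chat luong san pham tuyet voi', '0',
]

# A new group starts at every line that looks like a review id.
groups = split_array(lines, lambda x: bool(re.match('train_[0-9]{5}', x)))
print(groups)
# [['train_000000', 'Dung dc sp tot cam on shop', '0'],
#  ['train_000001', 'Chat luong san pham tuyet voi', '0']]
```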
def read_file(file_path, is_train=True):
    """Parse a .crash file into a DataFrame: each record starts with a train_/test_ id line,
    followed by the review text and, for training data, the label on the last line."""
file_path = abspath(file_path)
data_lines = list(
filter(lambda x: x != '', open(file_path).read().split('\n')))
pattern = ('train' if is_train else 'test') + '_[0-9]{5}'
datas = split_array(data_lines, lambda x: bool(re.match(pattern, x)))
if is_train:
result_array = list(map(
lambda x: [x[0], ' '.join(x[1:-1]), int(x[-1])], datas))
else:
result_array = list(map(lambda x: [x[0], ' '.join(x[1:])], datas))
columns = ['name', 'text', 'label'] if is_train else ['name', 'text']
return pd.DataFrame(result_array, columns=columns)
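The preview tables below were presumably produced by calls along these lines; the train path matches the one used in the datamodule further down, while the test path is an assumption.

```python
# The test path is hypothetical -- point it at wherever test.crash lives on your Drive.
train_path = '/content/drive/MyDrive/SLSOPS/dataset/Aivivn_vietnamese_dataset/train.crash'
test_path = '/content/drive/MyDrive/SLSOPS/dataset/Aivivn_vietnamese_dataset/test.crash'

train_df = read_file(train_path, is_train=True)
test_df = read_file(test_path, is_train=False)

train_df.head()
test_df.head()
```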
| | name | text | label |
---|---|---|---|
0 | train_000000 | "Dung dc sp tot cam on shop Đóng gói sản phẩm... | 0 |
1 | train_000001 | " Chất lượng sản phẩm tuyệt vời . Son mịn nhưn... | 0 |
2 | train_000002 | " Chất lượng sản phẩm tuyệt vời nhưng k có hộp... | 0 |
3 | train_000003 | ":(( Mình hơi thất vọng 1 chút vì mình đã kỳ v... | 1 |
4 | train_000004 | "Lần trước mình mua áo gió màu hồng rất ok mà ... | 1 |
| | name | text |
---|---|---|
0 | test_000000 | "Chưa dùng thử nên chưa biết" |
1 | test_000001 | " Không đáng tiềnVì ngay đợt sale nên mới mua ... |
2 | test_000002 | "Cám ơn shop. Đóng gói sản phẩm rất đẹp và chắ... |
3 | test_000003 | "Vải đẹp.phom oki luôn.quá ưng" |
4 | test_000004 | "Chuẩn hàng đóng gói đẹp" |
from collections import Counter, OrderedDict
from pathlib import Path
from typing import Any, Dict, List, Tuple, Union

import torch
import torchtext
from torch.nn.utils.rnn import pad_sequence
from torchtext.vocab import Vectors, Vocab
class Tokenizer():
def __init__(self, tokenizer: Any):
self.counter = Counter(['<pad>', '<unk>'])
self.tokenizer = tokenizer
self.vocab = None
self.update_vocab()
def update_vocab(self):
# sorted_by_freq_tuples = sorted(self.counter.items()[2:], key=lambda x: x[1], reverse=True)
ordered_dict = OrderedDict(self.counter.items())
self.vocab = torchtext.vocab.vocab(ordered_dict, min_freq=1)
def fit_on_texts(self, texts: List[str]):
"""
Updates internal vocabulary based on a list of texts.
"""
for text in texts:
tokens = [t.text for t in self.tokenizer(text)]
self.counter.update(tokens)
self.update_vocab()
    def texts_to_sequences(self, texts: List[str], tensor: bool = True) -> List[Union[List[int], torch.Tensor]]:
word2idx = self.vocab.get_stoi()
sequences = []
for text in texts:
seq = [word2idx.get(token.text, word2idx['<unk>']) for token in self.tokenizer(text)]
if tensor:
seq = torch.tensor(seq)
sequences.append(seq)
return sequences
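`Tokenizer` expects a callable whose output tokens expose a `.text` attribute (e.g. a spaCy tokenizer). The actual tokenizer object is not shown in this section, so the sketch below uses spaCy's blank multilingual pipeline as a stand-in.

```python
import spacy

# Stand-in tokenizer; the original notebook may have used a Vietnamese-specific one.
nlp = spacy.blank('xx')
tokenizer = Tokenizer(nlp.tokenizer)

tokenizer.fit_on_texts(['giao hàng nhanh', 'chất lượng sản phẩm tuyệt vời'])
print(tokenizer.texts_to_sequences(['giao hàng nhanh'], tensor=False))
# e.g. [[2, 3, 4]] -- indices 0 and 1 are reserved for <pad> and <unk>
```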
def _load_data_from(data_path: Union[str, Path]):
df = read_file(data_path)
sents = list(df['text'].str.strip().str.lower())
sentiments = list(df['label'])
return sents, sentiments
def _save_to_csv(file_path: Union[str, Path], data):
sents, sentiments = data
df = pd.DataFrame({
"text": sents,
"label": sentiments,
})
df.to_csv(file_path, index=False)
return file_path
def _preprocess_data(data: Tuple[List[str], List[str]], tokenizer: Tokenizer):
sentences, sentiments = data
sequences = tokenizer.texts_to_sequences(sentences)
sentiment_tensor = torch.tensor(sentiments)
# pad sequences
sequences = pad_sequence(sequences, batch_first=True)
assert len(sequences) == len(sentiments)
all_data = []
for i in range(len(sentiments)):
sample = {
'sequence': sequences[i],
'sentiment': sentiment_tensor[i]
}
all_data.append(sample)
return all_data
def build_vocab(tokenizer, data):
sentences = data[0]
tokenizer.fit_on_texts(sentences)
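Putting the helpers together, the preprocessing pipeline looks roughly like this sketch (the path is the one used in the datamodule below; the variable names are mine):

```python
train_path = '/content/drive/MyDrive/SLSOPS/dataset/Aivivn_vietnamese_dataset/train.crash'

train_data = _load_data_from(train_path)            # (sentences, sentiments)
build_vocab(tokenizer, train_data)                  # grow the tokenizer's vocabulary
samples = _preprocess_data(train_data, tokenizer)   # list of {'sequence', 'sentiment'} dicts
print(samples[0]['sequence'].shape, samples[0]['sentiment'])
```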
import os
import pickle

from gensim.models import KeyedVectors
from gensim.test.utils import datapath
import numpy as np
def load_pretrained_word_embeddings(w2v_path: str):
return KeyedVectors.load_word2vec_format(datapath(w2v_path), binary=False)
def create_embedding_matrix(w2v_model, vocab: Vocab, path: Union[str, Path]):
if os.path.exists(path):
print(f'loading embedding matrix from {path}')
embedding_matrix = pickle.load(open(path, 'rb'))
else:
# Calculate vector for OOV token
OOV_vec = torch.from_numpy(np.mean(w2v_model.vectors, axis=0))
embedding_matrix = torch.zeros((len(vocab), w2v_model.vector_size),
dtype=torch.float)
        # words missing from the pretrained embeddings get the mean (OOV) vector; <pad> stays all zeros
for word, index in vocab.get_stoi().items():
if word in w2v_model.vocab:
embedding_matrix[index] = torch.from_numpy(w2v_model[word])
else:
if word == "<pad>":
continue
embedding_matrix[index] = OOV_vec
# save embedding matrix
pickle.dump(embedding_matrix, open(path, 'wb'))
return embedding_matrix
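A hedged sketch of how these two helpers might be called; both paths are assumptions (use your own pretrained Vietnamese word2vec file and cache location).

```python
# Hypothetical paths.
w2v_path = '/content/drive/MyDrive/SLSOPS/pretrained/vi_word2vec_300d.txt'
matrix_cache = '/content/drive/MyDrive/SLSOPS/pretrained/embedding_matrix.pkl'

w2v_model = load_pretrained_word_embeddings(w2v_path)
embedding_matrix = create_embedding_matrix(w2v_model, tokenizer.vocab, matrix_cache)
print(embedding_matrix.shape)  # (len(vocab), w2v_model.vector_size)
```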
Now we create the datamodule class for the dataset using the PyTorch Lightning framework. You can read more here.
import torch
from torch.utils.data import DataLoader, Dataset, random_split
import pytorch_lightning as pl
class AIVIVNDataset(Dataset):
def __init__(self, data):
self.data = data
def __len__(self):
return len(self.data)
def __getitem__(self, idx):
return self.data[idx]
class AIVIVN(pl.LightningDataModule):
def __init__(self, tokenizer, opts: Dict[str, Any]):
super().__init__()
self.tokenizer = tokenizer
self.batch_size = opts['batch_size']
self.num_workers = opts['num_workers']
self.on_gpu = opts['on_gpu']
self.train_ds = None
self.val_ds = None
self.mapping = {"negative": 1, "positive": 0}
        self.inverse_mapping = {v: k for k, v in self.mapping.items()}
def prepare_data(self, *args, **kwargs) -> None:
self.train_path = '/content/drive/MyDrive/SLSOPS/dataset/Aivivn_vietnamese_dataset/train.crash'
def setup(self, stage: str = None) -> None:
if stage == "fit" or stage is None:
# Load data from files
train_data = _load_data_from(self.train_path)
preprocessed_data = _preprocess_data(train_data, self.tokenizer)
dataset = AIVIVNDataset(preprocessed_data)
lengths = [int(len(dataset)*0.85), len(dataset) - int(len(dataset)*0.85)]
self.train_ds, self.val_ds = random_split(dataset, lengths)
def train_dataloader(self):
return DataLoader(
self.train_ds,
shuffle=True,
batch_size=self.batch_size,
num_workers=self.num_workers,
pin_memory=self.on_gpu
)
def val_dataloader(self):
return DataLoader(
self.val_ds,
shuffle=False,
batch_size=self.batch_size,
num_workers=self.num_workers,
pin_memory=self.on_gpu,
)
def __repr__(self):
basic = f"AIVIVN Product Review Dataset\nNum classes: {len(self.mapping)}\nMapping: {self.mapping}\n"
if self.train_ds is None and self.val_ds is None:
return basic
batch = next(iter(self.train_dataloader()))
sequences, sentiments = batch['sequence'], batch['sentiment']
data = (
f"Train/val sizes: {len(self.train_ds)}, {len(self.val_ds)}\n"
f"Batch sequences stats: {(sequences.shape, sequences.dtype)}\n"
f"Batch sentiments stats: {(sentiments.shape, sentiments.dtype)}\n"
)
return basic + data
import torch.nn as nn
import torch.nn.functional as F
import torchmetrics
class ConvPool(nn.Module):
def __init__(self, in_channels, out_channels, conv_kernel_sz, pool_kernel_sz):
super(ConvPool, self).__init__()
self.conv = nn.Conv1d(in_channels=in_channels, out_channels=out_channels, kernel_size=conv_kernel_sz)
self.pool = nn.MaxPool1d(pool_kernel_sz)
def forward(self, x):
out = self.conv(x)
out = F.relu(out)
out = self.pool(out)
return out
class TextCNN(pl.LightningModule):
def __init__(self, embeddings, num_classes=2, batch_first=True, lr=1e-3, dropout=0, l2reg=0.01):
super().__init__()
embedding_dim = embeddings.shape[1]
self.embedding = nn.Embedding.from_pretrained(embeddings)
kernel_sizes = [3,4,5]
self.filters = nn.ModuleList([ConvPool(embedding_dim, 128, conv_kernel_sz=conv_kernel_size, pool_kernel_sz=5) for conv_kernel_size in kernel_sizes])
self.conv_pool1 = ConvPool(128, 128, 5, 5)
self.conv_pool2 = ConvPool(128, 128, 5, 30)
self.flatten = nn.Flatten(start_dim=1)
self.linear1 = nn.Linear(256, 128)
self.linear2 = nn.Linear(128, num_classes)
self.lr = lr
self.l2reg = l2reg
self.train_acc = torchmetrics.Accuracy()
self.val_acc = torchmetrics.Accuracy()
self.val_f1 = torchmetrics.F1(num_classes=2, average='macro')
self.test_acc = torchmetrics.Accuracy()
self.test_f1 = torchmetrics.F1(num_classes=2, average='macro')
def configure_optimizers(self):
optim = torch.optim.Adam(self.parameters(), lr=self.lr)
return optim
def forward(self, input):
        sequences = input['sequence'] # B x S
        embeds = self.embedding(sequences).permute(0, 2, 1) # B x H x S (channels first for Conv1d)
out_1 = self.filters[0](embeds)
out_2 = self.filters[1](embeds)
out_3 = self.filters[2](embeds)
out = torch.cat((out_1, out_2, out_3), dim=2)
out = self.conv_pool1(out)
out = self.conv_pool2(out)
out = self.flatten(out)
out = self.linear1(out)
out = F.relu(out)
logit = self.linear2(out)
return logit
def training_step(self, batch, batch_idx):
sentiments = batch['sentiment']
logits = self.forward(batch)
loss = F.cross_entropy(logits, sentiments)
scores = F.softmax(logits, dim=-1)
self.train_acc(scores, sentiments)
self.log('train_loss', loss, on_step=True, on_epoch=True, prog_bar=True)
self.log('train_acc', self.train_acc, on_step=False, on_epoch=True, prog_bar=True, logger=True)
return loss
def validation_step(self, batch, batch_idx): # pylint: disable=unused-argument
sentiments = batch['sentiment']
logits = self.forward(batch)
        loss = F.cross_entropy(logits, sentiments)
        scores = F.softmax(logits, dim=-1)
        self.val_acc(scores, sentiments)
        self.val_f1(scores, sentiments)
        self.log('val_loss', loss, on_step=True, on_epoch=True, prog_bar=True)
self.log('val_acc', self.val_acc, on_step=False, on_epoch=True, prog_bar=True, logger=True)
self.log('val_f1', self.val_f1, on_step=False, on_epoch=True, prog_bar=True, logger=True)
def test_step(self, batch, batch_idx): # pylint: disable=unused-argument
sentiments = batch['sentiment']
logits = self.forward(batch)
scores = F.softmax(logits, dim=-1)
self.test_acc(scores, sentiments)
self.test_f1(scores, sentiments)
self.log('test_acc', self.test_acc, on_step=False, on_epoch=True, logger=True)
self.log('test_f1', self.test_f1, on_step=False, on_epoch=True, logger=True)
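Note that the flattened feature size expected by `linear1` (256) depends on the padded sequence length; with sequences padded to 557 tokens, as in the inference code further down, the shapes line up. A quick shape check with random stand-in embeddings, purely as a sketch:

```python
import torch

# Random embeddings: 1000-word vocab, 300-dim vectors (assumptions for this check).
dummy_model = TextCNN(torch.randn(1000, 300))

# A batch of 2 sequences padded to 557 tokens.
dummy_batch = {'sequence': torch.randint(0, 1000, (2, 557))}
print(dummy_model(dummy_batch).shape)  # torch.Size([2, 2]) -- one logit per class
```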
nn.ModuleList does not have a forward() method because it does not define a network by itself; there is no connection between the nn.Module’s it stores. You can use it to store nn.Module’s, just like you use a Python list to store other types of objects (integers, strings, etc.).
The advantage of using nn.ModuleList instead of a plain Python list is that PyTorch is “aware” of the nn.Module’s inside an nn.ModuleList, which is not the case for a Python list. If you store the layers in a plain list, the optimizer will raise an error saying that the model has no parameters, because PyTorch does not see the parameters of layers stored in a Python list. If you use an nn.ModuleList instead, you’ll get no error.
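A minimal illustration of that difference:

```python
import torch.nn as nn

class WithModuleList(nn.Module):
    def __init__(self):
        super().__init__()
        self.layers = nn.ModuleList([nn.Linear(4, 4) for _ in range(3)])

class WithPythonList(nn.Module):
    def __init__(self):
        super().__init__()
        self.layers = [nn.Linear(4, 4) for _ in range(3)]  # invisible to PyTorch

print(len(list(WithModuleList().parameters())))  # 6 (a weight and a bias per layer)
print(len(list(WithPythonList().parameters())))  # 0 -- an optimizer would see no parameters
```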
from pytorch_lightning.callbacks import ModelCheckpoint
checkpoint_callback = ModelCheckpoint(
monitor='val_acc', # save the model with the best validation accuracy
dirpath='checkpoints',
mode='max',
)
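Before training, the tokenizer, embedding matrix, and datamodule have to be wired together. That cell is not shown here; a sketch, assuming the vocabulary was already built with `build_vocab` and that the `opts` values below are reasonable defaults:

```python
# Hypothetical options; tokenizer and embedding_matrix come from the earlier cells.
opts = {'batch_size': 64, 'num_workers': 2, 'on_gpu': True}
datamodule = AIVIVN(tokenizer, opts)
```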
# Set hyper-parameters
lr = 1e-3
num_epochs = 20
l2reg = 1e-5
dropout = 0.0
trainer = pl.Trainer(gpus=1, max_epochs=num_epochs, callbacks=[checkpoint_callback], deterministic=True)
# trainer = pl.Trainer(fast_dev_run=True, gpus=1) #Debug
# trainer = pl.Trainer(overfit_batches=0.1, max_epochs=num_epochs, gpus=1) #Debug
model = TextCNN(embedding_matrix, lr=lr, l2reg=l2reg, dropout=dropout)
trainer.fit(model, datamodule)
GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
| Name | Type | Params
------------------------------------------
0 | embedding | Embedding | 1.2 M
1 | filters | ModuleList | 153 K
2 | conv_pool1 | ConvPool | 82.0 K
3 | conv_pool2 | ConvPool | 82.0 K
4 | flatten | Flatten | 0
5 | linear1 | Linear | 32.9 K
6 | linear2 | Linear | 258
7 | train_acc | Accuracy | 0
8 | val_acc | Accuracy | 0
9 | val_f1 | F1 | 0
------------------------------------------
351 K Trainable params
1.2 M Non-trainable params
1.6 M Total params
6.283 Total estimated model params size (MB)
# Test the loaded model with the validation set to double check
trainer.test(my_model, datamodule.val_dataloader())
/usr/local/lib/python3.7/dist-packages/pytorch_lightning/core/datamodule.py:424: LightningDeprecationWarning: DataModule.setup has already been called, so it will not be called again. In v1.6 this behavior will change to always call DataModule.setup.
f"DataModule.{name} has already been called, so it will not be called again. "
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
--------------------------------------------------------------------------------
DATALOADER:0 TEST RESULTS
{'test_acc': 0.8881524205207825, 'test_f1': 0.8867566585540771}
--------------------------------------------------------------------------------
/usr/local/lib/python3.7/dist-packages/pytorch_lightning/core/datamodule.py:424: LightningDeprecationWarning: DataModule.teardown has already been called, so it will not be called again. In v1.6 this behavior will change to always call DataModule.teardown.
f"DataModule.{name} has already been called, so it will not be called again. "
[{'test_acc': 0.8881524205207825, 'test_f1': 0.8867566585540771}]
To run inference, we follow three steps:

1. Load the model and the tokenizer.
2. Define a preprocessing function to preprocess the input before feeding it into the model.
3. (Optional) Convert the predictions to labels.
import torch.nn.functional as F
inputs = [":(( Mình hơi thất vọng 1 chút vì mình đã kỳ vọng cuốn sách khá nhiều hi vọng nó sẽ nói về việc học tập của cách sinh viên trường Harvard ra sao những nỗ lực của họ như thế nào 4h sáng? tại sao họ lại phải thức dậy vào thời khắc đấy? sau đó là cả một câu chuyện ra sao. Cái mình thực sự cần ở đây là câu chuyện ẩn dấu trong đó để tự bản thân mỗi người cảm nhận và đi sâu vào lòng người hơn. Còn cuốn sách này chỉ đơn thuần là cuốn sách dạy kĩ năng mà hầu như sách nào cũng đã có. BUồn...", "Chất lượng sản phẩm tuyệt vời nhưng k có hộp k có dây giày đen k có tất"]
# preprocess input
def _preprocess_data_for_inference(sentences: List[str], tokenizer: Tokenizer):
sequences = tokenizer.texts_to_sequences(sentences, tensor=True)
    # pad every sequence to a fixed length (557, presumably the maximum length seen in training)
    sequences = torch.stack([F.pad(seq, (0, 557 - len(seq)), 'constant', 0) for seq in sequences])
return {"sequence": sequences}
input_data = _preprocess_data_for_inference(inputs, tokenizer)
new_model.eval()
predictions = new_model(input_data)
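For the optional third step, the logits can be mapped back to string labels using the dataset's convention (0 is positive, 1 is negative):

```python
scores = F.softmax(predictions, dim=-1)
predicted_ids = scores.argmax(dim=-1).tolist()

id2label = {0: 'positive', 1: 'negative'}
print([id2label[i] for i in predicted_ids])
```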
'":(( Mình hơi thất vọng 1 chút vì mình đã kỳ vọng cuốn sách khá nhiều hi vọng nó sẽ nói về việc học tập của cách sinh viên trường Harvard ra sao những nỗ lực của họ như thế nào 4h sáng? tại sao họ lại phải thức dậy vào thời khắc đấy? sau đó là cả một câu chuyện ra sao. Cái mình thực sự cần ở đây là câu chuyện ẩn dấu trong đó để tự bản thân mỗi người cảm nhận và đi sâu vào lòng người hơn. Còn cuốn sách này chỉ đơn thuần là cuốn sách dạy kĩ năng mà hầu như sách nào cũng đã có. BUồn..."'