Using torchtext for Data Preprocessing in the Toxic Comment Classification Competition

Competition: Toxic Comment Classification Challenge (www.kaggle.com)

[Figure: torchtext architecture diagram]

This post covers the data preprocessing part of an NLP task using torchtext, including:

  • how to define a Field
  • how to define a custom Dataset
  • how to create an Iterator

How to define a Field

In torchtext, a Field handles the following tasks:

  • tokenization (when creating Examples): "hello world." --> ["hello", "world", "."]
  • building the Vocab
  • pad (when creating a Batch): ["hello", "world", "."] --> ["hello", "world", ".", "<pad>", "<pad>"]. Because Examples differ in length, they must be padded to the same length before they can be batched.
  • numericalize (applied after padding; requires the Vocab): ["hello", "world", ".", "<pad>", "<pad>"] --> [2, 3, 4, 0, 0]

Of these tasks, pad and numericalize normally require no user code, and building the Vocab takes only a few lines; most of the code we have to write is the tokenization.
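
To make the pad and numericalize steps concrete, here is a minimal toy sketch (assuming the torchtext 0.2.x API used throughout this post; the DEMO Field below is hypothetical and not part of the competition code):

from torchtext import data

# a toy Field: the default tokenizer is just str.split
DEMO = data.Field(batch_first=True)
examples = [DEMO.preprocess("hello world ."), DEMO.preprocess("hello .")]
DEMO.build_vocab(examples)   # build the Vocab directly from the token lists
padded = DEMO.pad(examples)  # [['hello', 'world', '.'], ['hello', '.', '<pad>']]
tensor = DEMO.numericalize(padded, device=-1, train=False)  # LongTensor of indices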

# The text Field for the Toxic Comment Classification competition.
import spacy
import torch
from torchtext import data

spacy_en = spacy.load('en')

def tokenize(text):
    filters = '!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n'
    trans_map = str.maketrans(filters, " " * len(filters))
    text = text.translate(trans_map)
    text = [tok.text for tok in spacy_en.tokenizer(text) if tok.text != " "]
    tokenized_text = []
    auxiliary_verbs = ['am', 'is', 'are', 'was', 'were', "'s"]
    for token in text:
        if token == "n't":
            tmp = 'not'
        elif token == "'ll":
            tmp = 'will'
        elif token in auxiliary_verbs:
            tmp = 'be'
        else:
            tmp = token
        tokenized_text.append(tmp)
    return tokenized_text

# Note: truncate_first does not exist in older torchtext versions; I submitted
# a PR that has been merged, so it will be available after the next release.
TEXT = data.Field(tokenize=tokenize, lower=True, batch_first=True,
                  fix_length=100, truncate_first=True)

# The labels in this competition look like [1, 0, 0, 1, 1, 1]: they are not
# sequential and need no Vocab, so the label Field is defined as follows.
LABEL = data.Field(sequential=False, use_vocab=False, batch_first=True,
                   tensor_type=torch.FloatTensor)
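
A quick sanity check of the tokenizer (an illustrative example; the exact output depends on the spaCy version):

print(tokenize("I'll go, but he isn't happy."))
# roughly: ['I', 'will', 'go', 'but', 'he', 'be', 'not', 'happy']
# (lowercasing happens later in the Field, because of lower=True)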

Building the Vocab

from torchtext.vocab import Vectors
from torch.nn import init

vectors = Vectors(name='glove.6B.300d.txt', cache='.vector_cache/')
# Initialization for corpus tokens that are missing from vectors.
vectors.unk_init = init.xavier_uniform
# Load the pretrained word embeddings.
TEXT.build_vocab(train, vectors=vectors, max_size=30000)
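
A few quick checks of the resulting vocabulary can be useful (an illustrative sketch, assuming train has been built as shown in the next section):

print(len(TEXT.vocab))            # at most 30000 tokens plus <unk>/<pad>
print(TEXT.vocab.itos[:5])        # index -> token
print(TEXT.vocab.stoi['hello'])   # token -> index
print(TEXT.vocab.vectors.size())  # (len(TEXT.vocab), 300) embedding matrix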

How to define a Dataset

The main job of a custom Dataset is to turn the corpus into a list of Examples.

import pandas as pd
from tqdm import tqdm

class CustomDataset(data.Dataset):
    name = 'toxic comment'
    dirname = ''

    @staticmethod
    def sort_key(ex):
        return len(ex.text)

    def __init__(self, path, text_field, label_field, test=False, **kwargs):
        fields = [('text', text_field), ('label', label_field)]
        examples = []
        csv_data = pd.read_csv(path)
        print("preparing examples...")
        for i in tqdm(range(len(csv_data))):
            sample = csv_data.loc[i]
            text, label = self.process_csv_line(sample, test)
            examples.append(data.Example.fromlist([text, label], fields))
        super(CustomDataset, self).__init__(examples, fields, **kwargs)

    # A convenience method for creating several Datasets at once.
    @classmethod
    def splits(cls, root='./data', train='train.csv', test='test.csv', **kwargs):
        return super(CustomDataset, cls).splits(
            root=root, train=train, test=test, **kwargs)

    def process_csv_line(self, sample, test):
        text = sample["comment_text"]
        text = text.replace('\n', ' ')
        label = None
        if not test:
            label = [v for v in map(int, sample[["toxic", "severe_toxic",
                                                 "obscene", "threat",
                                                 "insult", "identity_hate"]])]
        return text, label
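
With the class defined, building the two splits is one line each (assuming the TEXT and LABEL Fields defined earlier and the competition's file layout):

train = CustomDataset("./data/train.csv", text_field=TEXT, label_field=LABEL)
test = CustomDataset("./data/test.csv", text_field=TEXT, label_field=None, test=True)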

How to create an Iterator

torchtext provides several kinds of Iterator; the ones used here are:

  • Iterator
  • BucketIterator

train_iter = data.Iterator(dataset=train, batch_size=32, train=True,
                           repeat=False, device=0 if using_gpu else -1)
# For test_iter, sort must be set to False, otherwise torchtext will scramble
# the sample order.
test_iter = data.Iterator(dataset=test, batch_size=64, train=False,
                          sort=False, device=0 if using_gpu else -1)
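
For completeness, here is a minimal sketch of the BucketIterator variant listed above (not part of the original snippet): it uses sort_key to put examples of similar length into the same batch, which minimizes padding.

train_iter = data.BucketIterator(dataset=train, batch_size=32, train=True,
                                 sort_key=lambda ex: len(ex.text),
                                 repeat=False,
                                 device=0 if using_gpu else -1)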

Complete data preprocessing code

import spacy
import torch
import pandas as pd
from tqdm import tqdm
from torchtext import data
from torchtext.vocab import Vectors
from torch.nn import init

spacy_en = spacy.load('en')


class CustomDataset(data.Dataset):
    name = 'toxic comment'
    dirname = ''

    @staticmethod
    def sort_key(ex):
        return len(ex.text)

    def __init__(self, path, text_field, label_field, test=False, **kwargs):
        fields = [('text', text_field), ('label', label_field)]
        examples = []
        csv_data = pd.read_csv(path)
        print("preparing examples...")
        for i in tqdm(range(len(csv_data))):
            sample = csv_data.loc[i]
            text, label = self.process_csv_line(sample, test)
            examples.append(data.Example.fromlist([text, label], fields))
        super(CustomDataset, self).__init__(examples, fields, **kwargs)

    @classmethod
    def splits(cls, root='./data', train='train.csv', test='test.csv', **kwargs):
        return super(CustomDataset, cls).splits(
            root=root, train=train, test=test, **kwargs)

    def process_csv_line(self, sample, test):
        text = sample["comment_text"]
        text = text.replace('\n', ' ')
        label = None
        if not test:
            label = [v for v in map(int, sample[["toxic", "severe_toxic",
                                                 "obscene", "threat",
                                                 "insult", "identity_hate"]])]
        return text, label


def prepare_data_and_model(Model, using_gpu=True):
    train_path = "./data/train.csv"
    test_path = "./data/test.csv"

    def tokenize(text):
        filters = '!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n'
        trans_map = str.maketrans(filters, " " * len(filters))
        text = text.translate(trans_map)
        text = [tok.text for tok in spacy_en.tokenizer(text) if tok.text != " "]
        tokenized_text = []
        auxiliary_verbs = ['am', 'is', 'are', 'was', 'were', "'s"]
        for token in text:
            if token == "n't":
                tmp = 'not'
            elif token == "'ll":
                tmp = 'will'
            elif token in auxiliary_verbs:
                tmp = 'be'
            else:
                tmp = token
            tokenized_text.append(tmp)
        return tokenized_text

    TEXT = data.Field(tokenize=tokenize, lower=True, batch_first=True,
                      fix_length=100, truncate_first=True)
    LABEL = data.Field(sequential=False, use_vocab=False, batch_first=True,
                       tensor_type=torch.FloatTensor)
    train = CustomDataset(train_path, text_field=TEXT, label_field=LABEL)
    test = CustomDataset(test_path, text_field=TEXT, label_field=None, test=True)
    vectors = Vectors(name='glove.6B.300d.txt', cache='.vector_cache/')
    vectors.unk_init = init.xavier_uniform
    # use the training corpus to create the vocabulary
    TEXT.build_vocab(train, vectors=vectors, max_size=30000)
    print('train.fields', train.fields)
    print('train.name', getattr(train, 'text'))
    print('len(train)', len(train))
    print('vars(train[0])', vars(train[0]))
    train_iter = data.Iterator(dataset=train, batch_size=32, train=True,
                               repeat=False, device=0 if using_gpu else -1)
    test_iter = data.Iterator(dataset=test, batch_size=64, train=False,
                              sort=False, device=0 if using_gpu else -1)
    num_tokens = len(TEXT.vocab.itos)
    num_classes = 6
    net = Model(embedding_size=300, num_tokens=num_tokens,
                num_classes=num_classes)
    net.embedding.weight.data.copy_(TEXT.vocab.vectors)
    if using_gpu:
        net.cuda()
    return train_iter, test_iter, net

Full code: KeithYin/ToxicCommentClassification-pytorch (github.com)