A Retrieval-Based Chatbot Built on the Ubuntu Dialogue Corpus

Data

The complete data set is available in this Google Drive folder: https://drive.google.com/open?id=1RIIbsS-vxR7Dlo2_v6FWHDFE7q1XPPgj
To reproduce the code in this document, do the following:
1) Download the following files:

  • glove.6B.50d.txt (Subfolder GloVe)
  • training_10000.csv (Subfolder MAIN FILES)
  • validation_1000.csv (Subfolder MAIN FILES)
  • testing_same_structure_1000.csv (Subfolder MAIN FILES)
  • testing_different_structure_100.csv (Subfolder MAIN FILES)
  • saved_model_10000_gpu.pt (Subfolder SAVED MODELS)

2) Adjust the size-related variables: num_training_examples, num_validation_examples, embedding_dim, test_dataframe_same_structure, test_dataframe_different_structure, and the saved-model file name in the code can all be changed to match the amount of data you use (see the example settings after this list).

3) Adjust the hyperparameters: you can tune the model's hyperparameters yourself, or start from the settings used for the checkpoints in the SAVED MODELS folder.
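For example, the settings below match the files listed above (a sketch for orientation only; saved_model_filename is a hypothetical variable name used here for illustration, the actual code passes file names directly):

# Example values matching the downloaded files (adjust to your own data sizes)
num_training_examples = 10000        # training_10000.csv
num_validation_examples = 1000       # validation_1000.csv
embedding_dim = 50                   # glove.6B.50d.txt
# test_dataframe_same_structure      <- testing_same_structure_1000.csv
# test_dataframe_different_structure <- testing_different_structure_100.csv
saved_model_filename = 'saved_model_10000_gpu.pt'  # hypothetical name; checkpoint from SAVED MODELS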

Code

Required libraries
import pandas as pd
import numpy as np
import torch.nn as nn
import torch
import torch.autograd as autograd
from torch.nn import init
import torch.nn.utils.rnn
import datetime
import operator
np.random.seed(0)
Helper functions that build the variables used during training and validation
def create_dataframe(csvfile):
    dataframe = pd.read_csv(csvfile)
    return dataframe

def shuffle_dataframe(dataframe):
    # reindex returns a new, permuted frame; return it so the caller can iterate over it
    return dataframe.reindex(np.random.permutation(dataframe.index))

def create_vocab(dataframe):
    # count lowercased word frequencies over contexts and responses
    word_freq = {}

    for index, row in dataframe.iterrows():
        context_cell = row["Context"]
        response_cell = row["Utterance"]
        train_words = str(context_cell).split() + str(response_cell).split()

        for word in train_words:
            word = word.lower()
            if word not in word_freq:
                word_freq[word] = 1
            else:
                word_freq[word] += 1

    # vocabulary: <UNK> first, then words sorted by frequency (most frequent first)
    word_freq_sorted = sorted(word_freq.items(), key=lambda item: item[1], reverse=True)
    vocab = ["<UNK>"] + [pair[0] for pair in word_freq_sorted]

    return vocab


def create_word_to_id(vocab):
    word_to_id = {word: id for id, word in enumerate(vocab)}

    return word_to_id


def create_id_to_vec(word_to_id, glovefile):
    lines = open(glovefile, 'r', encoding='utf-8').readlines()
    id_to_vec = {}
    vector = None

    for line in lines:
        word = line.split()[0]
        vector = np.array(line.split()[1:], dtype='float32')

        if word in word_to_id:
            id_to_vec[word_to_id[word]] = torch.FloatTensor(torch.from_numpy(vector))

    # words without a GloVe vector get a small random vector
    for word, id in word_to_id.items():
        if word_to_id[word] not in id_to_vec:
            v = np.zeros(*vector.shape, dtype='float32')
            v[:] = np.random.randn(*v.shape) * 0.01
            id_to_vec[word_to_id[word]] = torch.FloatTensor(torch.from_numpy(v))

    embedding_dim = id_to_vec[0].shape[0]

    return id_to_vec, embedding_dim


def load_ids_and_labels(row, word_to_id):
    context_ids = []
    response_ids = []

    context_cell = row['Context']
    response_cell = row['Utterance']
    label_cell = row['Label']

    max_context_len = 160

    context_words = context_cell.split()
    if len(context_words) > max_context_len:
        context_words = context_words[:max_context_len]
    for word in context_words:
        if word.lower() in word_to_id:  # the vocabulary is lowercased
            context_ids.append(word_to_id[word.lower()])
        else:
            context_ids.append(0)  # <UNK>

    response_words = response_cell.split()
    for word in response_words:
        if word.lower() in word_to_id:
            response_ids.append(word_to_id[word.lower()])
        else:
            response_ids.append(0)  # <UNK>

    label = np.array(label_cell).astype(np.float32)

    return context_ids, response_ids, label
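Each example row is thus turned into two lists of word IDs plus a float label. A minimal sanity-check sketch with a hand-built row (toy data, not part of the original pipeline):

# Toy illustration of load_ids_and_labels on a hand-made row
toy_row = pd.Series({"Context": "hello how are you __eou__",
                     "Utterance": "i am fine thanks __eou__",
                     "Label": 1})
toy_vocab = create_vocab(pd.DataFrame([toy_row]))
toy_word_to_id = create_word_to_id(toy_vocab)
print(load_ids_and_labels(toy_row, toy_word_to_id))
# -> (context word IDs, response word IDs, array(1., dtype=float32))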
Model definition
class Encoder(nn.Module):

    def __init__(self, emb_size, hidden_size, vocab_size, p_dropout):

        super(Encoder, self).__init__()

        self.emb_size = emb_size
        self.hidden_size = hidden_size
        self.vocab_size = vocab_size
        self.p_dropout = p_dropout

        self.embedding = nn.Embedding(self.vocab_size, self.emb_size)
        self.lstm = nn.LSTM(self.emb_size, self.hidden_size)
        self.dropout_layer = nn.Dropout(self.p_dropout)

        self.init_weights()

    def init_weights(self):
        init.uniform(self.lstm.weight_ih_l0, a = -0.01, b = 0.01)
        init.orthogonal(self.lstm.weight_hh_l0)
        self.lstm.weight_ih_l0.requires_grad = True
        self.lstm.weight_hh_l0.requires_grad = True

        # initialize the embedding matrix from the (global) id_to_vec lookup built earlier
        embedding_weights = torch.FloatTensor(self.vocab_size, self.emb_size)

        for id, vec in id_to_vec.items():
            embedding_weights[id] = vec

        self.embedding.weight = nn.Parameter(embedding_weights, requires_grad = True)

    def forward(self, inputs):
        embeddings = self.embedding(inputs)
        _, (last_hidden, _) = self.lstm(embeddings)       # (num_layers * num_directions x batch_size x hidden_size)
        last_hidden = self.dropout_layer(last_hidden[-1]) # last LSTM layer, (batch_size x hidden_size)

        return last_hidden


class DualEncoder(nn.Module):

    def __init__(self, encoder):
        super(DualEncoder, self).__init__()
        self.encoder = encoder
        self.hidden_size = self.encoder.hidden_size
        M = torch.FloatTensor(self.hidden_size, self.hidden_size)
        init.xavier_normal(M)
        self.M = nn.Parameter(M, requires_grad = True)

    def forward(self, context_tensor, response_tensor):

        context_last_hidden = self.encoder(context_tensor)   # (batch_size x hidden_size)
        response_last_hidden = self.encoder(response_tensor) # (batch_size x hidden_size)

        #context = context_last_hidden.mm(self.M).cuda()
        context = context_last_hidden.mm(self.M)             # (batch_size x hidden_size)
        context = context.view(-1, 1, self.hidden_size)      # (batch_size x 1 x hidden_size)

        response = response_last_hidden.view(-1, self.hidden_size, 1) # (batch_size x hidden_size x 1)

        #score = torch.bmm(context, response).view(-1, 1).cuda()
        score = torch.bmm(context, response).view(-1, 1)      # (batch_size x 1 x 1) -> (batch_size x 1)

        return score
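The dual encoder scores a (context, response) pair with the bilinear form c·M·rᵀ, where c and r are the final LSTM hidden states of the two sequences and M is a learned matrix. A minimal shape check of just that scoring step with toy tensors (hypothetical sizes, independent of the real model):

# Toy shape check of the bilinear scoring step (assumed batch_size = 4, hidden_size = 50)
batch_size, hidden_size = 4, 50
c = torch.randn(batch_size, hidden_size)   # stand-in for context_last_hidden
r = torch.randn(batch_size, hidden_size)   # stand-in for response_last_hidden
M = torch.randn(hidden_size, hidden_size)  # stand-in for the learned matrix M

score = torch.bmm(c.mm(M).view(-1, 1, hidden_size),
                  r.view(-1, hidden_size, 1)).view(-1, 1)
print(score.size())  # torch.Size([4, 1]) -- one raw logit per pair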
Data and variable construction

Define a function that calls all of the helper functions above to initialize the data and variables and load the pre-trained word vectors.

def creating_variables(num_training_examples, num_validation_examples, embedding_dim):

    print(str(datetime.datetime.now()).split('.')[0], "Creating variables for training and validation...")

    training_dataframe = create_dataframe('training_%d.csv' % num_training_examples)
    vocab = create_vocab(training_dataframe)
    word_to_id = create_word_to_id(vocab)
    id_to_vec, emb_dim = create_id_to_vec(word_to_id, 'glove.6B.%dd.txt' % embedding_dim)

    validation_dataframe = create_dataframe('validation_%d.csv' % num_validation_examples)

    print(str(datetime.datetime.now()).split('.')[0], "Variables created.\n")

    return training_dataframe, vocab, word_to_id, id_to_vec, emb_dim, validation_dataframe

Model construction

Call Encoder and DualEncoder to build the model.

def creating_model(hidden_size, p_dropout):

    print(str(datetime.datetime.now()).split('.')[0], "Calling model...")

    encoder = Encoder(
        emb_size = emb_dim,
        hidden_size = hidden_size,
        vocab_size = len(vocab),
        p_dropout = p_dropout)

    dual_encoder = DualEncoder(encoder)

    print(str(datetime.datetime.now()).split('.')[0], "Model created.\n")
    print(dual_encoder)

    return encoder, dual_encoder

Computing training and validation accuracy
def increase_count(correct_count, score, label):
    # score is the raw model output (a logit); this code thresholds it at 0.5 to decide the predicted class
    if ((score.data[0][0] >= 0.5) and (label.data[0][0] == 1.0)) or ((score.data[0][0] < 0.5) and (label.data[0][0] == 0.0)):
        correct_count += 1

    return correct_count

def get_accuracy(correct_count, dataframe):
    accuracy = correct_count / (len(dataframe))

    return accuracy
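A quick check of these two helpers with hand-made 1x1 tensors (a toy sketch; in the real loops the score comes from the model):

# Toy score (logit) and label standing in for real model output
toy_score = autograd.Variable(torch.FloatTensor([[0.7]]))
toy_label = autograd.Variable(torch.FloatTensor([[1.0]]))
print(increase_count(0, toy_score, toy_label))            # 1: counted as correct
print(get_accuracy(1, pd.DataFrame({"Label": [1, 0]})))   # 0.5 for 1 correct out of 2 rows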
The model training function
def train_model(learning_rate, l2_penalty, epochs):
    print(str(datetime.datetime.now()).split('.')[0], "Starting training and validation...\n")
    print("====================Data and Hyperparameter Overview====================\n")
    print("Number of training examples: %d, Number of validation examples: %d" %(len(training_dataframe), len(validation_dataframe)))
    print("Learning rate: %.5f, Embedding Dimension: %d, Hidden Size: %d, Dropout: %.2f, L2:%.10f\n" %(learning_rate, emb_dim, encoder.hidden_size, encoder.p_dropout, l2_penalty))
    print("================================Results...==============================\n")

    optimizer = torch.optim.Adam(dual_encoder.parameters(), lr = learning_rate, weight_decay = l2_penalty)

    loss_func = torch.nn.BCEWithLogitsLoss()
    #loss_func.cuda()

    best_validation_accuracy = 0.0

    for epoch in range(epochs):

        sum_loss_training = 0.0
        training_correct_count = 0

        dual_encoder.train()

        # iterate over a freshly shuffled copy of the training data each epoch
        for index, row in shuffle_dataframe(training_dataframe).iterrows():

            context_ids, response_ids, label = load_ids_and_labels(row, word_to_id)

            context = autograd.Variable(torch.LongTensor(context_ids).view(-1, 1), requires_grad = False) #.cuda()
            response = autograd.Variable(torch.LongTensor(response_ids).view(-1, 1), requires_grad = False) #.cuda()
            label = autograd.Variable(torch.FloatTensor(torch.from_numpy(np.array(label).reshape(1, 1))), requires_grad = False) #.cuda()

            score = dual_encoder(context, response)
            loss = loss_func(score, label)
            sum_loss_training += loss.data[0]

            loss.backward()
            optimizer.step()
            optimizer.zero_grad()

            training_correct_count = increase_count(training_correct_count, score, label)

        training_accuracy = get_accuracy(training_correct_count, training_dataframe)

        validation_correct_count = 0
        sum_loss_validation = 0.0

        dual_encoder.eval()

        for index, row in shuffle_dataframe(validation_dataframe).iterrows():

            context_ids, response_ids, label = load_ids_and_labels(row, word_to_id)

            context = autograd.Variable(torch.LongTensor(context_ids).view(-1, 1)) #.cuda()
            response = autograd.Variable(torch.LongTensor(response_ids).view(-1, 1)) #.cuda()
            label = autograd.Variable(torch.FloatTensor(torch.from_numpy(np.array(label).reshape(1, 1)))) #.cuda()

            score = dual_encoder(context, response)
            loss = loss_func(score, label)
            sum_loss_validation += loss.data[0]

            validation_correct_count = increase_count(validation_correct_count, score, label)

        validation_accuracy = get_accuracy(validation_correct_count, validation_dataframe)

        print(str(datetime.datetime.now()).split('.')[0],
              "Epoch: %d/%d" %(epoch, epochs),
              "TrainLoss: %.3f" %(sum_loss_training/len(training_dataframe)),
              "TrainAccuracy: %.3f" %(training_accuracy),
              "ValLoss: %.3f" %(sum_loss_validation/len(validation_dataframe)),
              "ValAccuracy: %.3f" %(validation_accuracy))

        if validation_accuracy > best_validation_accuracy:
            best_validation_accuracy = validation_accuracy
            torch.save(dual_encoder.state_dict(), 'saved_model_%d_examples.pt' %(len(training_dataframe)))
            print("New best found and saved.")

    print(str(datetime.datetime.now()).split('.')[0], "Training and validation epochs finished.")
Build the data
training_dataframe, vocab, word_to_id, id_to_vec, emb_dim, validation_dataframe = creating_variables(
    num_training_examples = 10000,
    embedding_dim = 50,
    num_validation_examples = 1000)
Set the hidden size and dropout probability and build the model
encoder, dual_encoder = creating_model(hidden_size = 50,
                                       p_dropout = 0.85)

#encoder.cuda()
#dual_encoder.cuda()

for name, param in dual_encoder.named_parameters():
    if param.requires_grad:
        print(name)
Set the learning rate, number of epochs, and L2 regularization strength, then start training
train_model(learning_rate = 0.0001,
            l2_penalty = 0.0001,
            epochs = 100)
Load the trained model for testing
dual_encoder.load_state_dict(torch.load('saved_model_10000_examples.pt'))
# to use the downloaded checkpoint instead, load 'saved_model_10000_gpu.pt'
# (pass map_location='cpu' to torch.load if you are not running on a GPU)

dual_encoder.eval()
Test setup 1:

The test set has the same data layout as the training and validation sets (context, response, label).
Evaluation metric: accuracy

test_dataframe_same_structure = pd.read_csv('testing_same_structure_1000.csv')

Build the test function

def testing_same_structure():

    test_correct_count = 0

    for index, row in test_dataframe_same_structure.iterrows():

        context_ids, response_ids, label = load_ids_and_labels(row, word_to_id)

        context = autograd.Variable(torch.LongTensor(context_ids).view(-1, 1)) #.cuda()
        response = autograd.Variable(torch.LongTensor(response_ids).view(-1, 1)) #.cuda()
        label = autograd.Variable(torch.FloatTensor(torch.from_numpy(np.array(label).reshape(1, 1)))) #.cuda()

        score = dual_encoder(context, response)

        test_correct_count = increase_count(test_correct_count, score, label)

    test_accuracy = get_accuracy(test_correct_count, test_dataframe_same_structure)

    return test_accuracy

Accuracy

test_accuracy = testing_same_structure()
print("Test accuracy for %d training examples and %d test examples: %.2f" %(len(training_dataframe),len(test_dataframe_same_structure),test_accuracy))

Test setup 2

The test set has a different layout from the training/validation sets (1 context, 1 ground-truth answer, 9 distractor answers).
Evaluation metric: recall
Load the data

test_dataframe_different_structure = pd.read_csv('testing_different_structure_100.csv')
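The loaded frame therefore has one context column, one ground-truth response column, and nine distractor columns. A minimal sketch of that layout (the 'Ground Truth Utterance' column name is also relied on by the scoring code below; the Distractor_* names follow the standard Ubuntu corpus test files and are an assumption here):

# Hypothetical one-row illustration of the wide test-file layout
toy_test_df = pd.DataFrame([{
    "Context": "how do i mount a usb drive ? __eou__",
    "Ground Truth Utterance": "try sudo mount /dev/sdb1 /mnt __eou__",
    **{"Distractor_%d" % i: "unrelated reply %d __eou__" % i for i in range(9)}  # assumed names
}])
print(toy_test_df.columns.tolist())  # 11 columns: Context + true response + 9 distractors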

Store the dialogue word IDs in nested dictionaries
Outer dictionary “ids_per_example_and_candidate”: keys = examples, values = inner dictionaries
Inner dictionaries “ids_per_candidate”: keys = candidate names, values = list of word IDs

def load_ids(test_dataframe_different_structure, word_to_id):

    print(str(datetime.datetime.now()).split('.')[0], "Loading test IDs...")

    max_context_len = 160

    ids_per_example_and_candidate = {}

    for i, example in test_dataframe_different_structure.iterrows():

        ids_per_candidate = {}

        for column_name, cell in example.items():

            id_list = []

            words = str(cell).split()
            if len(words) > max_context_len:
                words = words[:max_context_len]

            for word in words:
                if word.lower() in word_to_id:  # the vocabulary is lowercased
                    id_list.append(word_to_id[word.lower()])
                else:
                    id_list.append(0)  # <UNK>

            ids_per_candidate[column_name] = id_list

        ids_per_example_and_candidate[i] = ids_per_candidate

    print(str(datetime.datetime.now()).split('.')[0], "Test IDs loaded.")

    return ids_per_example_and_candidate

ids_per_example_and_candidate = load_ids(test_dataframe_different_structure, word_to_id)

Store the scores in nested dictionaries
Outer dictionary “scores_per_example_and_candidate”: keys = examples, values = inner dictionaries
Inner dictionaries “scores_per_candidate”: keys = candidate names, values = score

def load_scores():
    print(str(datetime.datetime.now()).split('.')[0], "Computing test scores...")

    scores_per_example_and_candidate = {}

    for example, utterance_ids_dict in sorted(ids_per_example_and_candidate.items()):

        score_per_candidate = {}

        for utterance_name, ids_list in sorted(utterance_ids_dict.items()):

            context = autograd.Variable(torch.LongTensor(utterance_ids_dict['Context']).view(-1, 1)) #.cuda()

            if utterance_name != 'Context':

                candidate_response = autograd.Variable(torch.LongTensor(ids_list).view(-1, 1)) #.cuda()

                score = torch.sigmoid(dual_encoder(context, candidate_response))

                score_per_candidate["Score with " + utterance_name] = score.data[0][0]

        scores_per_example_and_candidate[example] = score_per_candidate

    print(str(datetime.datetime.now()).split('.')[0], "Test scores computed.")

    return scores_per_example_and_candidate

scores_per_example_and_candidate = load_scores()

Define the recall computation:
The metric computed here is recall@k: for each test example the candidate responses are ranked by score, and the example counts as a hit if the ground-truth response is among the top k. For example, a ground-truth response ranked 3rd out of 10 is a hit for recall@5 but not for recall@2 or recall@1.

def get_recall_at_k(k):
    count_true_hits = 0

    for example, score_per_candidate_dict in sorted(scores_per_example_and_candidate.items()):

        top_k = dict(sorted(score_per_candidate_dict.items(), key=operator.itemgetter(1), reverse=True)[:k])

        if 'Score with Ground Truth Utterance' in top_k:
            count_true_hits += 1

    number_of_examples = len(scores_per_example_and_candidate)

    recall_at_k = count_true_hits / number_of_examples

    return recall_at_k

print("recall_at_5 =",get_recall_at_k(k = 5)) #Baseline expectation: 5/10 = 0.5 for random guess
print("recall_at_2 =",get_recall_at_k(k = 2)) #Baseline expectation: 2/10 = 0.2 for random guess
print("recall_at_1 =",get_recall_at_k(k = 1)) #Baseline expectation: 1/10 = 0.1 for random guess

It is recommended to enable the cuda() calls and train on a GPU.
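A minimal sketch of the required changes, assuming a CUDA-capable GPU and a CUDA build of PyTorch (these are the calls commented out with #.cuda() throughout the code above):

# Move the model to the GPU right after creating_model(...)
if torch.cuda.is_available():
    encoder.cuda()
    dual_encoder.cuda()
    # inside the training/validation/test loops the input tensors need .cuda() as well, e.g.:
    # context = autograd.Variable(torch.LongTensor(context_ids).view(-1, 1)).cuda()
    # response = autograd.Variable(torch.LongTensor(response_ids).view(-1, 1)).cuda()
    # label = autograd.Variable(torch.FloatTensor(torch.from_numpy(np.array(label).reshape(1, 1)))).cuda()
    # and in train_model: loss_func.cuda()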
github