{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": { "scrolled": false }, "outputs": [], "source": [ "import os, random, datetime, pickle\n", "from datetime import datetime\n", "\n", "import numpy as np\n", "import pandas as pd\n", "import scipy.sparse as sp\n", "import tensorflow as tf" ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "code_folding": [] }, "outputs": [], "source": [ "# reader\n", "\n", "class Reader(object):\n", " \n", " def read(self, data_path):\n", " handled_path = data_path + 'basic_trainer_saved.pkl'\n", "\n", " if os.path.exists(handled_path):\n", " print('load file from local')\n", " (self._entity_num, self._relation_num, self._relation_num_for_eval, self._train_data, self._test_data,\n", " self._valid_data) = pickle.load(open(handled_path, 'rb'))\n", " else:\n", " self.read_data()\n", " self.merge_id()\n", " self.add_reverse()\n", " self.reindex_kb()\n", " self.gen_t_label()\n", "\n", " print('start save dfs')\n", " saved = (\n", " self._entity_num, self._relation_num, self._relation_num_for_eval, self._train_data, self._test_data,\n", " self._valid_data)\n", " pickle.dump(saved, open(handled_path, 'wb'))\n", "\n", " self.gen_filter_mat()\n", " \n", " self._ent_num = self._entity_num\n", " self._rel_num = self._relation_num\n", " self._ent_mapping = pd.DataFrame({'kb_1':{}, 'kb_2':{}})\n", " self._rel_mapping = pd.DataFrame({'kb_1':{}, 'kb_2':{}})\n", " self._ent_testing = pd.DataFrame({'kb_1':{}, 'kb_2':{}})\n", " self._rel_testing = pd.DataFrame({'kb_1':{}, 'kb_2':{}})\n", " \n", " \n", " self._kb = self._train_data\n", " \n", " return \n", "\n", " def read_data(self):\n", " pass\n", "\n", " def merge_id(self):\n", " self._train_data['h_id'] = self._e_id[self._train_data.h].values\n", " self._train_data['r_id'] = self._r_id[self._train_data.r].values\n", " self._train_data['t_id'] = self._e_id[self._train_data.t].values\n", "\n", " self._test_data['h_id'] = self._e_id[self._test_data.h].values\n", " self._test_data['r_id'] = self._r_id[self._test_data.r].values\n", " self._test_data['t_id'] = self._e_id[self._test_data.t].values\n", "\n", " self._valid_data['h_id'] = self._e_id[self._valid_data.h].values\n", " self._valid_data['r_id'] = self._r_id[self._valid_data.r].values\n", " self._valid_data['t_id'] = self._e_id[self._valid_data.t].values\n", " \n", " def gen_t_label(self):\n", " full = pd.concat([self._train_data, self._test_data, self._valid_data], ignore_index=True)\n", " f_t_labels = full['t_id'].groupby([full['h_id'], full['r_id']]).apply(lambda x: pd.unique(x.values))\n", " f_t_labels.name = 't_label'\n", "\n", " self._test_data = self._test_data.join(f_t_labels, on=['h_id', 'r_id'])\n", "\n", " self._valid_data = self._valid_data.join(f_t_labels, on=['h_id', 'r_id'])\n", "\n", "\n", " def add_reverse(self):\n", " def add_reverse_for_data(data):\n", " reversed_data = data.rename(columns={'h_id': 't_id', 't_id': 'h_id'})\n", " reversed_data.r_id += self._relation_num\n", " data = pd.concat(([data, reversed_data]), ignore_index=True)\n", " return data\n", "\n", " self._train_data = add_reverse_for_data(self._train_data)\n", " self._test_data = add_reverse_for_data(self._test_data)\n", " self._valid_data = add_reverse_for_data(self._valid_data)\n", " self._relation_num_for_eval = self._relation_num\n", " self._relation_num *= 2\n", "# print (self._relation_num, self._relation_num_for_eval)\n", "\n", " def reindex_kb(self):\n", " train_data = self._train_data\n", " test_data = self._test_data\n", " valid_data = self._valid_data\n", " eids = pd.concat([train_data.h_id, train_data.t_id,], ignore_index=True)\n", "\n", " tv_eids = np.unique(pd.concat([test_data.h_id, test_data.t_id, valid_data.t_id, valid_data.h_id]))\n", " not_train_eids = tv_eids[~np.in1d(tv_eids, eids)]\n", "\n", " rids = pd.concat([train_data.r_id,],ignore_index=True)\n", " \n", "\n", " def gen_map(eids, rids):\n", " e_num = eids.groupby(eids.values).size().sort_values()[::-1]\n", " not_train = pd.Series(np.zeros_like(not_train_eids), index=not_train_eids)\n", " e_num = pd.concat([e_num, not_train])\n", "\n", " r_num = rids.groupby(rids.values).size().sort_values()[::-1]\n", " e_map = pd.Series(range(e_num.shape[0]), index=e_num.index)\n", " r_map = pd.Series(range(r_num.shape[0]), index=r_num.index)\n", " return e_map, r_map\n", " \n", " def remap_kb(kb, e_map, r_map):\n", " kb.loc[:, 'h_id'] = e_map.loc[kb.h_id.values].values\n", " kb.loc[:, 'r_id'] = r_map.loc[kb.r_id.values].values\n", " kb.loc[:, 't_id'] = e_map.loc[kb.t_id.values].values\n", " return kb\n", " \n", " def remap_id(s, rm):\n", " s = rm.loc[s.values].values\n", " return s\n", " \n", " e_map, r_map = gen_map(eids, rids)\n", " self._e_map, self._r_map = e_map, r_map\n", " \n", " self._train_data = remap_kb(train_data, e_map, r_map)\n", " self._valid_data = remap_kb(self._valid_data, e_map, r_map)\n", " self._test_data = remap_kb(self._test_data, e_map, r_map)\n", " \n", " self._e_id = remap_id(self._e_id, e_map)\n", " self._r_id = remap_id(self._r_id, r_map)\n", " \n", " return not_train_eids\n", " \n", " \n", " def in2d(self, arr1, arr2):\n", " \"\"\"Generalisation of numpy.in1d to 2D arrays\"\"\"\n", "\n", " assert arr1.dtype == arr2.dtype\n", "\n", " arr1_view = np.ascontiguousarray(arr1).view(np.dtype((np.void,\n", " arr1.dtype.itemsize * arr1.shape[1])))\n", " arr2_view = np.ascontiguousarray(arr2).view(np.dtype((np.void,\n", " arr2.dtype.itemsize * arr2.shape[1])))\n", " intersected = np.in1d(arr1_view, arr2_view)\n", " return intersected.view(np.bool).reshape(-1)\n", "\n", "\n", "\n", "\n", "\n", " def gen_filter_mat(self):\n", " def gen_filter_vector(r):\n", " v = np.ones(self._entity_num)\n", " v[r] = -1\n", " return v\n", "\n", " print('start gen filter mat')\n", "\n", "\n", "\n", " self._tail_valid_filter_mat = np.stack(self._valid_data.t_label.apply(gen_filter_vector).values)\n", " self._tail_test_filter_mat = np.stack(self._test_data.t_label.apply(gen_filter_vector).values)\n", "\n", "\n", "\n", " def gen_label_mat_for_train(self):\n", " def gen_train_relation_label_vac(r):\n", " c = pd.value_counts(r)\n", " values = 1. * c.values / c.sum()\n", " return np.stack([c.index, values], axis=1)\n", "\n", " def gen_train_entity_label_vac(r):\n", " indices = np.stack([r.label_id.values, r.values], axis=1)\n", " values = np.ones_like(r.values, dtype=np.int)\n", " return tf.SparseTensor(indices=indices, values=values, dense_shape=[1, self._entity_num])\n", "\n", " tr = self._train_data\n", " print('start gen t_label')\n", " labels = tr['t_id'].groupby([tr['h_id'], tr['r_id']]).size()\n", " labels = pd.Series(range(labels.shape[0]), index=labels.index)\n", " labels.name = 'label_id'\n", " tr = tr.join(labels, on=['h_id', 'r_id'])\n", "\n", " self._train_data = tr\n", " sp_tr = tf.SparseTensor(tr[['label_id', 't_id']].values, np.ones([len(tr)], dtype=np.float32), dense_shape=[len(tr), self._entity_num])\n", "\n", " self._label_indices, self._label_values = sp_tr.indices[:], sp_tr.values[:]\n", "\n", "\n", "class FreeBaseReader(Reader):\n", "\n", " def read_data(self):\n", " path = self._options.data_path\n", " tr = pd.read_csv(path + 'train.txt', header=None, sep='\\t', names=['h', 't', 'r'])\n", " te = pd.read_csv(path + 'test.txt', header=None, sep='\\t', names=['h', 't', 'r'])\n", " val = pd.read_csv(path + 'valid.txt', header=None, sep='\\t', names=['h', 't', 'r'])\n", "\n", " e_id = pd.read_csv(path + 'entity2id.txt', header=None, sep='\\t', names=['e', 'eid'])\n", " e_id = pd.Series(e_id.eid.values, index=e_id.e.values)\n", " r_id = pd.read_csv(path + 'relation2id.txt', header=None, sep='\\t', names=['r', 'rid'])\n", " r_id = pd.Series(r_id.rid.values, index=r_id.r.values)\n", " \n", " \n", "\n", " self._entity_num = e_id.shape[0]\n", " self._relation_num = r_id.shape[0]\n", "\n", "\n", " self._train_data = tr\n", " self._test_data = te\n", " self._valid_data = val\n", "\n", " self._e_id, self._r_id = e_id, r_id\n", "\n", "\n", "class WordNetReader(Reader):\n", "\n", " def read_data(self):\n", " path = self._options.data_path\n", " tr = pd.read_csv(path+'train.txt', header=None, sep='\\t', names=['h', 'r', 't'])\n", " te = pd.read_csv(path + 'test.txt', header=None, sep='\\t', names=['h', 'r', 't'])\n", " val = pd.read_csv(path + 'valid.txt', header=None, sep='\\t', names=['h', 'r', 't'])\n", " \n", " r_list = pd.unique(pd.concat([tr.r, te.r, val.r]))\n", " r_list = pd.Series(r_list, index=np.arange(r_list.shape[0]))\n", " \n", " e_list = pd.unique(pd.concat([tr.h, te.h, val.h, tr.t, te.t, val.t, ]))\n", " e_list = pd.Series(e_list, index=np.arange(e_list.shape[0]))\n", " \n", " \n", " e_id = pd.Series(e_list.index, index=e_list.values)\n", " r_id = pd.Series(r_list.index, index=r_list.values)\n", "\n", "\n", "\n", "\n", " self._entity_num = e_id.shape[0]\n", " self._relation_num = r_id.shape[0]\n", "\n", " self._train_data = tr\n", " self._test_data = te\n", " self._valid_data = val\n", "\n", " self._e_id, self._r_id = e_id, r_id" ] }, { "cell_type": "code", "execution_count": 3, "metadata": { "code_folding": [] }, "outputs": [], "source": [ "# path sampler\n", "\n", "\n", "class BasicSampler(object):\n", "\n", " def sample_paths(self, repeat_times=2):\n", " opts = self._options\n", "\n", " kb = self._kb.copy()\n", "\n", " kb = kb[['h_id', 'r_id', 't_id']]\n", "\n", " # sampling triples with the h_id-(r_id,t_id) form.\n", "\n", " rtlist = np.unique(kb[['r_id', 't_id']].values, axis=0)\n", "\n", " rtdf = pd.DataFrame(rtlist, columns=['r_id', 't_id'])\n", "\n", " rtdf = rtdf.reset_index().rename({'index': 'tail_id'}, axis='columns')\n", "\n", " rtkb = kb.merge(\n", " rtdf, left_on=['r_id', 't_id'], right_on=['r_id', 't_id'])\n", "\n", " htail = np.unique(rtkb[['h_id', 'tail_id']].values, axis=0)\n", "\n", " htailmat = csr_matrix((np.ones(len(htail)), (htail[:, 0], htail[:, 1])),\n", " shape=(model._ent_num, rtlist.shape[0]))\n", "\n", " # calulate corss-KG bias at first\n", " em = pd.concat(\n", " [model._ent_mapping.kb_1, model._ent_mapping.kb_2]).values\n", "\n", " rtkb['across'] = rtkb.t_id.isin(em)\n", " rtkb.loc[rtkb.across, 'across'] = opts.beta\n", " rtkb.loc[rtkb.across == 0, 'across'] = 1-opts.beta\n", "\n", " rtailkb = rtkb[['h_id', 't_id', 'tail_id', 'across']]\n", "\n", " def gen_tail_dict(x):\n", " return x.tail_id.values, x.across.values / x.across.sum()\n", "\n", " rtailkb = rtailkb.groupby('h_id').apply(gen_tail_dict)\n", "\n", " rtailkb = pd.DataFrame({'tails': rtailkb})\n", "\n", " # start sampling\n", "\n", " hrt = np.repeat(kb.values, repeat_times, axis=0)\n", "\n", " # for initial triples\n", " def perform_random(x):\n", " return np.random.choice(x.tails[0], 1, p=x.tails[1].astype(np.float))\n", "\n", " # else\n", " def perform_random2(x):\n", "\n", " # calculate depth bias\n", " pre_c = htailmat[np.repeat(x.pre, x.tails[0].shape[0]), x.tails[0]]\n", " pre_c[pre_c == 0] = opts.alpha\n", " pre_c[pre_c == 1] = 1-opts.alpha\n", " p = x.tails[1].astype(np.float).reshape(\n", " [-1, ]) * pre_c.A.reshape([-1, ])\n", " p = p / p.sum()\n", " return np.random.choice(x.tails[0], 1, p=p)\n", "\n", " rt_x = rtailkb.loc[hrt[:, 2]].apply(perform_random, axis=1)\n", " rt_x = rtlist[np.concatenate(rt_x.values)]\n", "\n", " rts = [hrt, rt_x]\n", " c_length = 5\n", " while(c_length < opts.max_length):\n", " curr = rtailkb.loc[rt_x[:, 1]]\n", " curr.loc[:, 'pre'] = hrt[:, 0]\n", "\n", " rt_x = curr.apply(perform_random2, axis=1)\n", " rt_x = rtlist[np.concatenate(rt_x.values)]\n", "\n", " rts.append(rt_x)\n", " c_length += 2\n", "\n", " data = np.concatenate(rts, axis=1)\n", " data = pd.DataFrame(data)\n", " \n", " self._train_data = data\n", " data.to_csv('%spaths_%.1f_%.1f' % (opts.data_path, opts.alpha, opts.beta))\n", " " ] }, { "cell_type": "code", "execution_count": 4, "metadata": { "code_folding": [ 2, 7, 50, 60, 94, 111 ], "scrolled": false }, "outputs": [], "source": [ "# model\n", "class RSN4KGC(FreeBaseReader):\n", " def __init__(self, options, session):\n", " self._options = options\n", " self._session = session\n", "\n", "\n", " def init_variables(self):\n", " options = self._options\n", " hidden_size = options.hidden_size\n", "\n", "\n", " self._entity_embedding = tf.get_variable(\n", " 'entity_embedding',\n", " [self._ent_num, hidden_size],\n", " initializer=tf.contrib.layers.xavier_initializer()\n", " )\n", "\n", "\n", " self._relation_embedding = tf.get_variable(\n", " 'relation_embedding',\n", " [self._rel_num, hidden_size],\n", " initializer=tf.contrib.layers.xavier_initializer()\n", " )\n", "\n", " self._rel_w = tf.get_variable(\n", " \"relation_softmax_w\",\n", " [self._rel_num, hidden_size],\n", " initializer=tf.contrib.layers.xavier_initializer()\n", " )\n", " self._rel_b = tf.get_variable(\n", " \"relation_softmax_b\",\n", " [self._rel_num],\n", " initializer=tf.constant_initializer(0)\n", " )\n", " self._ent_w = tf.get_variable(\n", " \"entity_softmax_w\",\n", " [self._ent_num, hidden_size],\n", " initializer=tf.contrib.layers.xavier_initializer()\n", " )\n", " self._ent_b = tf.get_variable(\n", " \"entity_softmax_b\",\n", " [self._ent_num],\n", " initializer=tf.constant_initializer(0)\n", " )\n", "\n", " self._lr = tf.Variable(options.learning_rate, trainable=False)\n", "\n", " self._optimizer = tf.train.AdamOptimizer(options.learning_rate)\n", "\n", " def bn(self, inputs, is_train=True, reuse=True):\n", " return tf.contrib.layers.batch_norm(inputs,\n", " center=True,\n", " scale=True,\n", " is_training=is_train,\n", " reuse=reuse,\n", " scope='bn',\n", " data_format='NCHW'\n", " )\n", "\n", " def lstm_cell(self, drop=True, keep_prob=0.5, num_layers=2, hidden_size=None):\n", " if not hidden_size:\n", " hidden_size = self._options.hidden_size\n", "\n", " def basic_lstm_cell():\n", " return tf.contrib.rnn.LSTMCell(\n", " num_units=hidden_size,\n", " initializer=tf.orthogonal_initializer,\n", " forget_bias=1,\n", " reuse=tf.get_variable_scope().reuse,\n", " activation=tf.identity\n", " )\n", "\n", "\n", " def drop_cell():\n", " return tf.contrib.rnn.DropoutWrapper(\n", " basic_lstm_cell(),\n", " output_keep_prob=keep_prob\n", " )\n", "\n", " if drop:\n", " gen_cell = drop_cell\n", " else:\n", " gen_cell = basic_lstm_cell\n", " \n", " if num_layers==0:\n", " return gen_cell()\n", " \n", " cell = tf.contrib.rnn.MultiRNNCell(\n", " [gen_cell() for _ in range(num_layers)],\n", " state_is_tuple=True,\n", " )\n", " return cell\n", "\n", " def sampled_loss(self, inputs, labels, w, b, weight=1, is_entity=False):\n", " num_sampled = min(self._options.num_samples, w.shape[0]//3)\n", " \n", " labels = tf.reshape(labels, [-1, 1])\n", "\n", " losses = tf.nn.sampled_softmax_loss(\n", " weights=w,\n", " biases=b,\n", " labels=labels,\n", " inputs=tf.reshape(inputs, [-1, int(w.shape[1])]),\n", " num_sampled=num_sampled,\n", " num_classes=w.shape[0],\n", " partition_strategy='div',\n", " )\n", "\n", " return losses * weight\n", "\n", " def logits(self, input, predict_relation=True):\n", " if not predict_relation:\n", " w = self._ent_w\n", " b = self._ent_b\n", " else:\n", " w = self._rel_w\n", " b = self._rel_b\n", " \n", " return tf.nn.bias_add(tf.matmul(input, tf.transpose(w)), b)\n", "\n", " def sample(self, data):\n", " choices = np.random.choice(len(data), size=len(data), replace=False)\n", " return data.iloc[choices]\n", " \n", " def padding_data(self, data):\n", " padding_num = self._options.batch_size - len(data) % self._options.batch_size\n", " data = np.concatenate([data, np.zeros((padding_num, data.shape[1]), dtype=np.int32)])\n", " return data, padding_num" ] }, { "cell_type": "code", "execution_count": 5, "metadata": { "code_folding": [ 0 ] }, "outputs": [], "source": [ "# build tensorflow graph\n", "\n", "\n", "# build an RSN of length l\n", "def build_sub_graph(self, length=15, reuse=False):\n", " options = self._options\n", " hidden_size = options.hidden_size\n", " batch_size = options.batch_size\n", "\n", " seq = tf.placeholder(\n", " tf.int32, [batch_size, length], name='seq'+str(length))\n", "\n", " e_em, r_em = self._entity_embedding, self._relation_embedding\n", "\n", " # seperately read, and then recover the order\n", " ent = seq[:, :-1:2]\n", " rel = seq[:, 1::2]\n", "\n", " ent_em = tf.nn.embedding_lookup(e_em, ent)\n", " rel_em = tf.nn.embedding_lookup(r_em, rel)\n", "\n", " em_seq = []\n", " for i in range(length-1):\n", " if i % 2 == 0:\n", " em_seq.append(ent_em[:, i//2])\n", " else:\n", " em_seq.append(rel_em[:, i//2])\n", "\n", " # seperately bn\n", " with tf.variable_scope('input_bn'):\n", " if not reuse:\n", " bn_em_seq = [tf.reshape(self.bn(em_seq[i], reuse=(\n", " i is not 0)), [-1, 1, hidden_size]) for i in range(length-1)]\n", " else:\n", " bn_em_seq = [tf.reshape(\n", " self.bn(em_seq[i], reuse=True), [-1, 1, hidden_size]) for i in range(length-1)]\n", "\n", " bn_em_seq = tf.concat(bn_em_seq, axis=1)\n", "\n", " ent_bn_em = bn_em_seq[:, ::2]\n", "\n", " with tf.variable_scope('rnn', reuse=reuse):\n", "\n", " cell = self.lstm_cell(True, options.keep_prob, options.num_layers)\n", "\n", " outputs, state = tf.nn.dynamic_rnn(cell, bn_em_seq, dtype=tf.float32)\n", "\n", " rel_outputs = outputs[:, 1::2, :]\n", " outputs = [outputs[:, i, :] for i in range(length-1)]\n", "\n", " ent_outputs = outputs[::2]\n", "\n", " # RSN\n", " with tf.variable_scope('resnet', reuse=reuse):\n", " res_rel_outputs = tf.contrib.layers.fully_connected(rel_outputs, hidden_size, biases_initializer=None, activation_fn=None) +\\\n", " tf.contrib.layers.fully_connected(\n", " ent_bn_em, hidden_size, biases_initializer=None, activation_fn=None)\n", "\n", " # recover the order\n", " res_rel_outputs = [res_rel_outputs[:, i, :] for i in range((length-1)//2)]\n", " outputs = []\n", " for i in range(length-1):\n", " if i % 2 == 0:\n", " outputs.append(ent_outputs[i//2])\n", " else:\n", " outputs.append(res_rel_outputs[i//2])\n", "\n", " # output bn\n", " with tf.variable_scope('output_bn'):\n", " if reuse:\n", " bn_outputs = [tf.reshape(\n", " self.bn(outputs[i], reuse=True), [-1, 1, hidden_size]) for i in range(length-1)]\n", " else:\n", " bn_outputs = [tf.reshape(self.bn(outputs[i], reuse=(\n", " i is not 0)), [-1, 1, hidden_size]) for i in range(length-1)]\n", "\n", " def cal_loss(bn_outputs, seq):\n", " losses = []\n", "\n", " decay = 0.8\n", " for i, output in enumerate(bn_outputs):\n", " if i % 2 == 0:\n", " losses.append(self.sampled_loss(\n", " output, seq[:, i+1], self._rel_w, self._rel_b, weight=decay**(0), is_entity=i))\n", " else:\n", " losses.append(self.sampled_loss(\n", " output, seq[:, i+1], self._ent_w, self._ent_b, weight=decay**(0), is_entity=i))\n", " losses = tf.stack(losses, axis=1)\n", " return losses\n", "\n", " seq_loss = cal_loss(bn_outputs, seq)\n", "\n", " losses = tf.reduce_sum(seq_loss) / batch_size\n", "\n", " return losses, seq\n", "\n", "\n", "# build the main graph\n", "def build_graph(self):\n", " options = self._options\n", "\n", " loss, seq = build_sub_graph(self, length=options.max_length, reuse=False)\n", "\n", " tvars = tf.trainable_variables()\n", " grads, _ = tf.clip_by_global_norm(tf.gradients(loss, tvars), 2.0)\n", " update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)\n", " with tf.control_dependencies(update_ops):\n", " train_op = self._optimizer.apply_gradients(\n", " zip(grads, tvars),\n", " global_step=tf.train.get_or_create_global_step()\n", " )\n", "\n", " self._seq, self._loss, self._train_op = seq, loss, train_op" ] }, { "cell_type": "code", "execution_count": 6, "metadata": { "code_folding": [ 2 ] }, "outputs": [], "source": [ "# training procedure\n", "\n", "def seq_train(self, data, choices=None, epoch=None):\n", " opts = self._options\n", " \n", " # shuffle data\n", " choices = np.random.choice(len(data), size=len(data), replace=True)\n", " batch_size = opts.batch_size\n", " \n", " \n", " num_batch = len(data) // batch_size\n", " \n", " fetches = {\n", " 'loss': self._loss,\n", " 'train_op': self._train_op\n", " }\n", " \n", " losses = 0 \n", " for i in range(num_batch):\n", " \n", " one_batch_choices = choices[i * batch_size : (i + 1) * batch_size]\n", " one_batch_data = data.iloc[one_batch_choices]\n", "\n", " feed_dict = {}\n", " seq = one_batch_data.values[:, :opts.max_length]\n", " feed_dict[self._seq] = seq\n", " vals = self._session.run(fetches, feed_dict)\n", "\n", " del one_batch_data\n", "\n", " loss = vals['loss']\n", " losses += loss\n", " print('\\r%i/%i, batch_loss:%f' % (i, num_batch, loss), end='')\n", " self._last_mean_loss = losses / num_batch\n", "\n", " return self._last_mean_loss" ] }, { "cell_type": "code", "execution_count": 7, "metadata": { "code_folding": [] }, "outputs": [], "source": [ "#build the graph for entity prediction (KG completion)\n", "\n", "def build_eval_ep(self, length=3, reuse=True):\n", " options = self._options\n", " hidden_size = options.hidden_size\n", " batch_size = options.batch_size\n", "\n", " seq = tf.placeholder(tf.int32, [batch_size, length], name='eval_seq')\n", " \n", " e_em, r_em = self._entity_embedding, self._relation_embedding\n", " \n", " ent = seq[:, :-1:2]\n", " rel = seq[:, 1::2]\n", "\n", " ent_em = tf.nn.embedding_lookup(e_em, ent)\n", " rel_em = tf.nn.embedding_lookup(r_em, rel)\n", " \n", " \n", " em_seq = []\n", " for i in range(length-1):\n", " if i % 2 == 0:\n", " em_seq.append(ent_em[:, i//2])\n", " else:\n", " em_seq.append(rel_em[:, i//2])\n", " \n", "\n", " with tf.variable_scope('input_bn', reuse=reuse):\n", " if not reuse:\n", " bn_em_seq = [tf.reshape(self.bn(em_seq[i], reuse=(\n", " i is not 0)), [-1, 1, hidden_size]) for i in range(length-1)]\n", " else:\n", " bn_em_seq = [tf.reshape(\n", " self.bn(em_seq[i], reuse=True), [-1, 1, hidden_size]) for i in range(length-1)]\n", " \n", " \n", " \n", " bn_em_seq = tf.concat(bn_em_seq, axis=1)\n", " \n", " ent_bn_em = bn_em_seq[:, ::2]\n", " \n", " \n", "\n", " with tf.variable_scope('rnn', reuse=reuse):\n", "\n", " cell = self.lstm_cell(True, options.keep_prob, options.num_layers)\n", "\n", " outputs, state = tf.nn.dynamic_rnn(cell, bn_em_seq, dtype=tf.float32)\n", " \n", " \n", " rel_outputs = outputs[:, 1::2, :]\n", " outputs = [outputs[:, i, :] for i in range(length-1)]\n", " \n", " ent_outputs = outputs[::2]\n", "\n", "\n", " with tf.variable_scope('resnet', reuse=reuse):\n", " res_rel_outputs = tf.contrib.layers.fully_connected(rel_outputs, hidden_size, biases_initializer=None, activation_fn=None) +\\\n", " tf.contrib.layers.fully_connected(ent_bn_em, hidden_size, biases_initializer=None, activation_fn=None)\n", "\n", "\n", " res_rel_outputs = [res_rel_outputs[:, i, :] for i in range((length-1)//2)]\n", "\n", " outputs=[]\n", " \n", " for i in range(length-1):\n", " if i % 2==0:\n", " outputs.append(ent_outputs[i//2])\n", " else:\n", " outputs.append(res_rel_outputs[i//2])\n", " \n", " with tf.variable_scope('output_bn', reuse=reuse):\n", " if reuse:\n", " bn_outputs = [tf.reshape(\n", " self.bn(outputs[i], reuse=True), [-1, hidden_size]) for i in range(length-1)]\n", " else:\n", " bn_outputs = [tf.reshape(self.bn(outputs[i], reuse=(\n", " i is not 0)), [-1, hidden_size]) for i in range(length-1)]\n", " \n", " \n", " logits = self.logits(bn_outputs[1], predict_relation=False)\n", "\n", " probs = tf.nn.softmax(logits)\n", "\n", " self._eval_seq = seq\n", "\n", " self._entity_probs = probs\n" ] }, { "cell_type": "code", "execution_count": 8, "metadata": { "code_folding": [ 2 ] }, "outputs": [], "source": [ "#evaluate the performance on KG completion (entity prediction)\n", "\n", "def eval_entity_prediction(model, data, filter_mat, method='min', return_ranks=False, return_probs=False, return_label_probs=False):\n", " options = model._options\n", " batch_size = options.batch_size\n", "\n", " label = data[:, 2]\n", " \n", " data, padding_num = model.padding_data(data)\n", "\n", " num_batch = len(data) // batch_size \n", " \n", " eval_seq, fectch_entity_probs = model._eval_seq, model._entity_probs\n", " \n", " probs = []\n", " for i in range(num_batch):\n", "\n", " feed_dict = {}\n", " feed_dict[eval_seq] = data[i * batch_size:(i + 1) * batch_size]\n", " \n", " probs.append(sess.run(fectch_entity_probs, feed_dict))\n", " probs = np.concatenate(probs)[:len(data) - padding_num]\n", " if return_label_probs:\n", " return probs[range(len(label)), label]\n", " \n", " if return_probs:\n", " return probs\n", " filter_probs = probs * filter_mat\n", " filter_probs[range(len(label)), label] = probs[range(len(label)), label]\n", " filter_ranks = cal_ranks(filter_probs, method=method, label=label)\n", " if return_ranks:\n", " return filter_ranks\n", " _, f_h_1, _ = cal_performance(filter_ranks, top=1)\n", " f_m_r, f_h_10, f_mrr = cal_performance(filter_ranks)\n", " \n", " return (f_h_1, f_h_10, f_mrr, f_m_r)" ] }, { "cell_type": "code", "execution_count": 9, "metadata": { "code_folding": [] }, "outputs": [], "source": [ "# some tool functions\n", "def cal_ranks(probs, method, label):\n", " if method == 'min':\n", " probs = probs - probs[range(len(label)), label].reshape(len(probs), 1)\n", " ranks = (probs > 0).sum(axis=1) + 1\n", " else:\n", " ranks = pd.DataFrame(probs).rank(axis=1, ascending=False, method=method)\n", " ranks = ranks.values[range(len(label)), label]\n", " return ranks\n", "\n", "def cal_performance(ranks, top=10):\n", " m_r = sum(ranks) * 1.0 / len(ranks)\n", " h_10 = sum(ranks <= top) * 1.0 / len(ranks)\n", " mrr = (1. / ranks).sum() / len(ranks)\n", " return m_r, h_10, mrr\n", "\n", "def padding_data(data, options, batch_size):\n", " padding_num = batch_size - len(data) % batch_size\n", " data = pd.concat([data, pd.DataFrame(np.zeros((padding_num, data.shape[1])), dtype=np.int32, columns=data.columns)],ignore_index=True, axis=0)\n", " return data, padding_num\n", "\n", "def in2d(arr1, arr2):\n", " \"\"\"Generalisation of numpy.in1d to 2D arrays\"\"\"\n", "\n", " assert arr1.dtype == arr2.dtype\n", "\n", " arr1_view = np.ascontiguousarray(arr1).view(np.dtype((np.void,\n", " arr1.dtype.itemsize * arr1.shape[1])))\n", " arr2_view = np.ascontiguousarray(arr2).view(np.dtype((np.void,\n", " arr2.dtype.itemsize * arr2.shape[1])))\n", " intersected = np.in1d(arr1_view, arr2_view)\n", " return intersected.view(np.bool).reshape(-1)\n", "\n", "def write_to_log(path, content):\n", " with open(path, 'a+') as f:\n", " print(content, file=f)" ] }, { "cell_type": "code", "execution_count": 10, "metadata": { "scrolled": true }, "outputs": [], "source": [ "#parameter settings\n", "\n", "class Options(object):\n", " pass\n", "\n", "opts = Options()\n", "opts.hidden_size = 256\n", "opts.num_layers = 2\n", "opts.batch_size = 2048\n", "opts.learning_rate = 0.0001 # for FB15K-237, the learning rate should decrease to 0.00001\n", "opts.num_samples = 2048*5\n", "opts.keep_prob = 0.5\n", "\n", "opts.max_length=7\n", "opts.alpha = 0.7\n", "opts.beta = 0.5\n", "\n", "\n", "opts.data_path = 'data/FB15k/'\n", "opts.log_file_path = 'logs/%s%dl_%s.log' % (opts.data_path.replace(\n", " '/', '-'), opts.max_length, datetime.now().strftime('%y-%m-%d-%H-%M'))\n", "\n", "config = tf.ConfigProto()\n", "config.gpu_options.allow_growth = True" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "#initial model\n", "\n", "sess = tf.InteractiveSession(config=config)\n", "model = RSN4KGC(options=opts, session=sess)\n", "\n", "model.read(data_path=model._options.data_path)\n", "model.init_variables()\n", "\n", "sequence_datapath = '%spaths_%.1f_%.1f' % (\n", " model._options.data_path, model._options.alpha, model._options.beta)\n", "\n", "if not os.path.exists(sequence_datapath):\n", " print('start to sample paths')\n", " model.sample_paths()\n", " train_data = model._train_data\n", "else:\n", " print('load existing training sequences')\n", " train_data = pd.read_csv(sequence_datapath, index_col=0)\n", "\n", "\n", "\n", "# build tensorflow graph and init all tensors\n", "build_graph(model)\n", "build_eval_ep(model)\n", "tf.global_variables_initializer().run()" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [], "source": [ "# initial training settings\n", "\n", "write_to_log(opts.log_file_path, opts.__dict__)\n", "epoch = 0\n", "last_mean_loss=0\n", "\n", "max_hits1, times, max_times = 0, 0, 3\n", "\n", "# we transform the subject entity prediction (?, r, o) to (o, r-, ?) for convenience\n", "test_data = model._test_data[['h_id', 'r_id', 't_id']].values\n", "filter_mat = model._tail_test_filter_mat\n", "\n", "valid_data = model._valid_data[['h_id', 'r_id', 't_id']].values\n", "vfilter_mat = model._tail_valid_filter_mat" ] }, { "cell_type": "code", "execution_count": 13, "metadata": { "code_folding": [], "scrolled": true }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "epoch:0, Hits@1:0.000, Hits@10:0.001, MRR:0.001, MR:7414.918, mean_loss:0.000\n", "942/943, batch_loss:24.576305\n", "epoch:1, Hits@1:0.183, Hits@10:0.382, MRR:0.252, MR:332.066, mean_loss:33.139\n", "942/943, batch_loss:15.754082\n", "epoch:6, Hits@1:0.489, Hits@10:0.762, MRR:0.586, MR:64.169, mean_loss:15.935\n", "942/943, batch_loss:14.618158\n", "epoch:11, Hits@1:0.595, Hits@10:0.822, MRR:0.676, MR:50.366, mean_loss:14.690\n", "942/943, batch_loss:14.370004\n", "epoch:16, Hits@1:0.645, Hits@10:0.850, MRR:0.719, MR:46.891, mean_loss:14.343\n", "942/943, batch_loss:14.146772\n", "epoch:21, Hits@1:0.670, Hits@10:0.862, MRR:0.740, MR:45.969, mean_loss:14.187\n", "942/943, batch_loss:14.086667\n", "epoch:26, Hits@1:0.680, Hits@10:0.865, MRR:0.748, MR:46.136, mean_loss:14.099\n", "942/943, batch_loss:13.916122\n", "epoch:31, Hits@1:0.688, Hits@10:0.868, MRR:0.754, MR:46.376, mean_loss:14.034\n", "942/943, batch_loss:13.882902\n", "epoch:36, Hits@1:0.695, Hits@10:0.869, MRR:0.758, MR:46.269, mean_loss:13.989\n", "942/943, batch_loss:13.909997\n", "epoch:41, Hits@1:0.696, Hits@10:0.869, MRR:0.759, MR:46.180, mean_loss:13.950\n", "942/943, batch_loss:13.895548\n", "epoch:46, Hits@1:0.698, Hits@10:0.870, MRR:0.761, MR:46.885, mean_loss:13.916\n", "942/943, batch_loss:13.814245\n", "epoch:51, Hits@1:0.701, Hits@10:0.870, MRR:0.763, MR:46.271, mean_loss:13.888\n", "942/943, batch_loss:13.757550\n", "epoch:56, Hits@1:0.704, Hits@10:0.870, MRR:0.765, MR:45.982, mean_loss:13.863\n", "942/943, batch_loss:13.820094\n", "epoch:61, Hits@1:0.705, Hits@10:0.871, MRR:0.766, MR:46.494, mean_loss:13.836\n", "942/943, batch_loss:13.762841\n", "epoch:66, Hits@1:0.707, Hits@10:0.871, MRR:0.767, MR:46.543, mean_loss:13.812\n", "942/943, batch_loss:13.859014\n", "epoch:71, Hits@1:0.707, Hits@10:0.872, MRR:0.768, MR:46.826, mean_loss:13.793\n", "942/943, batch_loss:13.633964\n", "epoch:76, Hits@1:0.711, Hits@10:0.872, MRR:0.770, MR:47.177, mean_loss:13.769\n", "942/943, batch_loss:13.731867\n", "epoch:81, Hits@1:0.712, Hits@10:0.871, MRR:0.771, MR:46.326, mean_loss:13.745\n", "942/943, batch_loss:13.592937\n", "epoch:86, Hits@1:0.714, Hits@10:0.873, MRR:0.772, MR:46.850, mean_loss:13.725\n", "942/943, batch_loss:13.760557\n", "epoch:91, Hits@1:0.715, Hits@10:0.873, MRR:0.773, MR:47.007, mean_loss:13.703\n", "942/943, batch_loss:13.642200\n", "epoch:96, Hits@1:0.717, Hits@10:0.873, MRR:0.774, MR:47.306, mean_loss:13.686\n", "942/943, batch_loss:13.576491\n", "epoch:101, Hits@1:0.717, Hits@10:0.873, MRR:0.774, MR:47.622, mean_loss:13.660\n", "942/943, batch_loss:13.632428\n", "epoch:106, Hits@1:0.719, Hits@10:0.873, MRR:0.776, MR:47.280, mean_loss:13.645\n", "942/943, batch_loss:13.591526\n", "epoch:111, Hits@1:0.719, Hits@10:0.872, MRR:0.775, MR:47.591, mean_loss:13.626\n", "942/943, batch_loss:13.612853\n", "epoch:116, Hits@1:0.720, Hits@10:0.872, MRR:0.775, MR:47.870, mean_loss:13.597\n", "942/943, batch_loss:13.414848\n", "epoch:121, Hits@1:0.723, Hits@10:0.873, MRR:0.777, MR:47.483, mean_loss:13.576\n", "\n", "epoch:121, Hits@1:0.721, Hits@10:0.873, MRR:0.777, MR:46.839, mean_loss:13.576\n" ] } ], "source": [ "r = eval_entity_prediction(model, data=valid_data, filter_mat=vfilter_mat)\n", "msg = 'epoch:%i, Hits@1:%.3f, Hits@10:%.3f, MRR:%.3f, MR:%.3f, mean_loss:%.3f' % (epoch, r[0],r[1],r[2],r[3], last_mean_loss)\n", "print('\\n'+msg)\n", "write_to_log(opts.log_file_path, msg)\n", "\n", "\n", "for i in range(epoch, 200):\n", " last_mean_loss = seq_train(model, train_data)\n", " epoch += 1\n", " \n", " # evaluation\n", " if i % 5 ==0:\n", " r = eval_entity_prediction(model, data=valid_data, filter_mat=vfilter_mat)\n", " msg = 'epoch:%i, Hits@1:%.3f, Hits@10:%.3f, MRR:%.3f, MR:%.3f, mean_loss:%.3f' % (i+1, r[0],r[1],r[2],r[3], last_mean_loss)\n", " print('\\n'+msg)\n", " write_to_log(opts.log_file_path, msg)\n", " \n", " hits1 = r[0]\n", " \n", " # early stop\n", " if hits1 > max_hits1:\n", " max_hits1 = hits1\n", " times = 0\n", " else:\n", " times += 1\n", "\n", " if times >= max_times:\n", " break\n", " \n", "#evaluation on testing data\n", "r = eval_entity_prediction(model, data=test_data, filter_mat=filter_mat, method='average')\n", "msg = 'epoch:%i, Hits@1:%.3f, Hits@10:%.3f, MRR:%.3f, MR:%.3f, mean_loss:%.3f' % (epoch, r[0],r[1],r[2],r[3], last_mean_loss)\n", "print('\\n'+msg)\n", "write_to_log(opts.log_file_path, msg)" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.7.3" } }, "nbformat": 4, "nbformat_minor": 2 }