

Deep Interest Evolution Network (DIEN) Part 3: Code Walkthrough of Model Training and Model Structure

Following the data-processing part in the previous installment, this installment walks through the training and network-model code in detail. Parts of the code have been modified to run under Python 3. The fully annotated train.py is given first:

  import numpy
  from data_iterator import DataIterator
  import tensorflow as tf
  from model import *
  import time
  import random
  import sys
  from utils import *

  EMBEDDING_DIM = 18
  HIDDEN_SIZE = 18 * 2
  ATTENTION_SIZE = 18 * 2
  best_auc = 0.0

  def prepare_data(input, target, maxlen=None, return_neg=False):
      # input: N training samples; each sample has the fields
      #   user id [0], item id [1], item category [2], clicked items (n) [3], clicked item categories (n) [4],
      #   non-clicked items (n*5) [5], non-clicked item categories (n*5) [6]
      # target: positive or negative label
      lengths_x = [len(s[4]) for s in input]      # N, number of previously clicked items per sample
      seqs_mid = [inp[3] for inp in input]        # N * n, clicked-item sequences
      seqs_cat = [inp[4] for inp in input]        # N * n, clicked-category sequences
      noclk_seqs_mid = [inp[5] for inp in input]  # N * n * 5, non-clicked item sequences
      noclk_seqs_cat = [inp[6] for inp in input]  # N * n * 5, non-clicked category sequences

      if maxlen is not None:
          new_seqs_mid = []
          new_seqs_cat = []
          new_noclk_seqs_mid = []
          new_noclk_seqs_cat = []
          new_lengths_x = []
          for l_x, inp in zip(lengths_x, input):  # zip yields a sequence of tuples, as long as the shortest input
              if l_x > maxlen:  # keep only the most recent maxlen clicks
                  new_seqs_mid.append(inp[3][l_x - maxlen:])
                  new_seqs_cat.append(inp[4][l_x - maxlen:])
                  new_noclk_seqs_mid.append(inp[5][l_x - maxlen:])
                  new_noclk_seqs_cat.append(inp[6][l_x - maxlen:])
                  new_lengths_x.append(maxlen)
              else:
                  new_seqs_mid.append(inp[3])
                  new_seqs_cat.append(inp[4])
                  new_noclk_seqs_mid.append(inp[5])
                  new_noclk_seqs_cat.append(inp[6])
                  new_lengths_x.append(l_x)
          lengths_x = new_lengths_x
          seqs_mid = new_seqs_mid
          seqs_cat = new_seqs_cat
          noclk_seqs_mid = new_noclk_seqs_mid
          noclk_seqs_cat = new_noclk_seqs_cat
          if len(lengths_x) < 1:
              return None, None, None, None

      n_samples = len(seqs_mid)        # number of samples N
      maxlen_x = numpy.max(lengths_x)  # longest click history in this batch
      if maxlen_x <= 1:
          maxlen_x = 2  # keep at least length 2: the auxiliary loss pairs step t with step t+1
      neg_samples = len(noclk_seqs_mid[0][0])  # negative samples per historical click (5)

      mid_his = numpy.zeros((n_samples, maxlen_x)).astype('int64')     # N * maxlen_x, clicked item id sequences
      cat_his = numpy.zeros((n_samples, maxlen_x)).astype('int64')     # N * maxlen_x, clicked item category sequences
      noclk_mid_his = numpy.zeros((n_samples, maxlen_x, neg_samples)).astype('int64')  # N * maxlen_x * 5, negative items per click
      noclk_cat_his = numpy.zeros((n_samples, maxlen_x, neg_samples)).astype('int64')  # N * maxlen_x * 5, negative categories per click
      mid_mask = numpy.zeros((n_samples, maxlen_x)).astype('float32')  # N * maxlen_x, marks the valid click positions
      for idx, [s_x, s_y, no_sx, no_sy] in enumerate(zip(seqs_mid, seqs_cat, noclk_seqs_mid, noclk_seqs_cat)):
          mid_mask[idx, :lengths_x[idx]] = 1.             # the first lengths_x[idx] positions (actual clicks) are set to 1
          mid_his[idx, :lengths_x[idx]] = s_x             # clicked item ids of sample idx
          cat_his[idx, :lengths_x[idx]] = s_y             # clicked item categories of sample idx
          noclk_mid_his[idx, :lengths_x[idx], :] = no_sx  # negative item ids of sample idx
          noclk_cat_his[idx, :lengths_x[idx], :] = no_sy  # negative item categories of sample idx

      uids = numpy.array([inp[0] for inp in input])  # N, user ids
      mids = numpy.array([inp[1] for inp in input])  # N, candidate item ids
      cats = numpy.array([inp[2] for inp in input])  # N, candidate item categories
      if return_neg:
          # uids: N, user ids
          # mids: N, candidate item ids
          # cats: N, candidate item categories
          # mid_his: N * maxlen_x, clicked item id sequences
          # cat_his: N * maxlen_x, clicked item category sequences
          # mid_mask: N * maxlen_x, 1 at valid click positions, 0 at padding
          # numpy.array(target): N * 2, label, positive [1, 0] or negative [0, 1]
          # numpy.array(lengths_x): N, actual click-history lengths
          # noclk_mid_his: N * maxlen_x * 5, negative items per historical click
          # noclk_cat_his: N * maxlen_x * 5, negative categories per historical click
          return uids, mids, cats, mid_his, cat_his, mid_mask, numpy.array(target), numpy.array(lengths_x), noclk_mid_his, noclk_cat_his
      else:
          return uids, mids, cats, mid_his, cat_his, mid_mask, numpy.array(target), numpy.array(lengths_x)

  def eval(sess, test_data, model, model_path):
      loss_sum = 0.
      accuracy_sum = 0.
      aux_loss_sum = 0.
      nums = 0
      stored_arr = []
      for src, tgt in test_data:
          nums += 1
          uids, mids, cats, mid_his, cat_his, mid_mask, target, sl, noclk_mids, noclk_cats = prepare_data(src, tgt, return_neg=True)
          # see prepare_data above for the meaning and shape of each returned variable
          prob, loss, acc, aux_loss = model.calculate(sess, [uids, mids, cats, mid_his, cat_his, mid_mask, target, sl, noclk_mids, noclk_cats])
          loss_sum += loss
          aux_loss_sum += aux_loss
          accuracy_sum += acc
          prob_1 = prob[:, 0].tolist()
          target_1 = target[:, 0].tolist()
          for p, t in zip(prob_1, target_1):
              stored_arr.append([p, t])
      test_auc = calc_auc(stored_arr)
      accuracy_sum = accuracy_sum / nums
      loss_sum = loss_sum / nums
      aux_loss_sum = aux_loss_sum / nums
      global best_auc
      if best_auc < test_auc:
          best_auc = test_auc
          model.save(sess, model_path)
      return test_auc, loss_sum, accuracy_sum, aux_loss_sum

  def train(
          train_file = "local_train_splitByUser",
          test_file = "local_test_splitByUser",
          # line 1 of each pair: label 0, user id, item id (not clicked, negative), item category,
          #                      all previously clicked item ids, all previously clicked item categories
          # line 2 of each pair: label 1, user id, item id (clicked, positive), item category,
          #                      all previously clicked item ids, all previously clicked item categories
          uid_voc = "uid_voc.pkl",
          mid_voc = "mid_voc.pkl",
          cat_voc = "cat_voc.pkl",
          batch_size = 128,
          maxlen = 100,
          test_iter = 100,
          save_iter = 100,
          model_type = 'DNN',
          seed = 2,
  ):
      model_path = "dnn_save_path/ckpt_noshuff" + model_type + str(seed)
      best_model_path = "dnn_best_model/ckpt_noshuff" + model_type + str(seed)
      gpu_options = tf.GPUOptions(allow_growth=True)
      with tf.Session(config=tf.ConfigProto(gpu_options=gpu_options)) as sess:
          train_data = DataIterator(train_file, uid_voc, mid_voc, cat_voc, batch_size, maxlen, shuffle_each_epoch=False)
          test_data = DataIterator(test_file, uid_voc, mid_voc, cat_voc, batch_size, maxlen)
          n_uid, n_mid, n_cat = train_data.get_n()  # numbers of users, items and categories
          if model_type == 'DNN':
              model = Model_DNN(n_uid, n_mid, n_cat, EMBEDDING_DIM, HIDDEN_SIZE, ATTENTION_SIZE)
          elif model_type == 'PNN':
              model = Model_PNN(n_uid, n_mid, n_cat, EMBEDDING_DIM, HIDDEN_SIZE, ATTENTION_SIZE)
          elif model_type == 'Wide':
              model = Model_WideDeep(n_uid, n_mid, n_cat, EMBEDDING_DIM, HIDDEN_SIZE, ATTENTION_SIZE)
          elif model_type == 'DIN':
              model = Model_DIN(n_uid, n_mid, n_cat, EMBEDDING_DIM, HIDDEN_SIZE, ATTENTION_SIZE)
          elif model_type == 'DIN-V2-gru-att-gru':
              model = Model_DIN_V2_Gru_att_Gru(n_uid, n_mid, n_cat, EMBEDDING_DIM, HIDDEN_SIZE, ATTENTION_SIZE)
          elif model_type == 'DIN-V2-gru-gru-att':
              model = Model_DIN_V2_Gru_Gru_att(n_uid, n_mid, n_cat, EMBEDDING_DIM, HIDDEN_SIZE, ATTENTION_SIZE)
          elif model_type == 'DIN-V2-gru-qa-attGru':
              model = Model_DIN_V2_Gru_QA_attGru(n_uid, n_mid, n_cat, EMBEDDING_DIM, HIDDEN_SIZE, ATTENTION_SIZE)
          elif model_type == 'DIN-V2-gru-vec-attGru':
              model = Model_DIN_V2_Gru_Vec_attGru(n_uid, n_mid, n_cat, EMBEDDING_DIM, HIDDEN_SIZE, ATTENTION_SIZE)
          elif model_type == 'DIEN':
              model = Model_DIN_V2_Gru_Vec_attGru_Neg(n_uid, n_mid, n_cat, EMBEDDING_DIM, HIDDEN_SIZE, ATTENTION_SIZE)
          else:
              print("Invalid model_type : %s" % model_type)
              return

          sess.run(tf.global_variables_initializer())
          sess.run(tf.local_variables_initializer())
          sys.stdout.flush()
          print('test_auc: %.4f ---- test_loss: %.4f ---- test_accuracy: %.4f ---- test_aux_loss: %.4f' % eval(sess, test_data, model, best_model_path))
          sys.stdout.flush()

          start_time = time.time()
          iter = 0
          lr = 0.001
          for itr in range(3):
              loss_sum = 0.0
              accuracy_sum = 0.
              aux_loss_sum = 0.
              for src, tgt in train_data:
                  # src: user id, item id, item category, clicked items (n), clicked categories (n),
                  #      non-clicked items (n*5), non-clicked categories (n*5)
                  # tgt: positive or negative label
                  uids, mids, cats, mid_his, cat_his, mid_mask, target, sl, noclk_mids, noclk_cats = prepare_data(src, tgt, maxlen, return_neg=True)
                  loss, acc, aux_loss = model.train(sess, [uids, mids, cats, mid_his, cat_his, mid_mask, target, sl, lr, noclk_mids, noclk_cats])
                  loss_sum += loss
                  accuracy_sum += acc
                  aux_loss_sum += aux_loss
                  iter += 1
                  sys.stdout.flush()
                  if (iter % test_iter) == 0:
                      print('iter: %d ----> train_loss: %.4f ---- train_accuracy: %.4f ---- train_aux_loss: %.4f' %
                            (iter, loss_sum / test_iter, accuracy_sum / test_iter, aux_loss_sum / test_iter))
                      print('                         test_auc: %.4f ---- test_loss: %.4f ---- test_accuracy: %.4f ---- test_aux_loss: %.4f' % eval(sess, test_data, model, best_model_path))
                      loss_sum = 0.0
                      accuracy_sum = 0.0
                      aux_loss_sum = 0.0
                  if (iter % save_iter) == 0:
                      print('save model iter: %d' % (iter))
                      model.save(sess, model_path + "--" + str(iter))
              lr *= 0.5

  def test(
          train_file = "local_train_splitByUser",
          test_file = "local_test_splitByUser",
          uid_voc = "uid_voc.pkl",
          mid_voc = "mid_voc.pkl",
          cat_voc = "cat_voc.pkl",
          batch_size = 128,
          maxlen = 100,
          model_type = 'DNN',
          seed = 2
  ):
      model_path = "dnn_best_model/ckpt_noshuff" + model_type + str(seed)
      gpu_options = tf.GPUOptions(allow_growth=True)
      with tf.Session(config=tf.ConfigProto(gpu_options=gpu_options)) as sess:
          train_data = DataIterator(train_file, uid_voc, mid_voc, cat_voc, batch_size, maxlen)
          test_data = DataIterator(test_file, uid_voc, mid_voc, cat_voc, batch_size, maxlen)
          n_uid, n_mid, n_cat = train_data.get_n()
          if model_type == 'DNN':
              model = Model_DNN(n_uid, n_mid, n_cat, EMBEDDING_DIM, HIDDEN_SIZE, ATTENTION_SIZE)
          elif model_type == 'PNN':
              model = Model_PNN(n_uid, n_mid, n_cat, EMBEDDING_DIM, HIDDEN_SIZE, ATTENTION_SIZE)
          elif model_type == 'Wide':
              model = Model_WideDeep(n_uid, n_mid, n_cat, EMBEDDING_DIM, HIDDEN_SIZE, ATTENTION_SIZE)
          elif model_type == 'DIN':
              model = Model_DIN(n_uid, n_mid, n_cat, EMBEDDING_DIM, HIDDEN_SIZE, ATTENTION_SIZE)
          elif model_type == 'DIN-V2-gru-att-gru':
              model = Model_DIN_V2_Gru_att_Gru(n_uid, n_mid, n_cat, EMBEDDING_DIM, HIDDEN_SIZE, ATTENTION_SIZE)
          elif model_type == 'DIN-V2-gru-gru-att':
              model = Model_DIN_V2_Gru_Gru_att(n_uid, n_mid, n_cat, EMBEDDING_DIM, HIDDEN_SIZE, ATTENTION_SIZE)
          elif model_type == 'DIN-V2-gru-qa-attGru':
              model = Model_DIN_V2_Gru_QA_attGru(n_uid, n_mid, n_cat, EMBEDDING_DIM, HIDDEN_SIZE, ATTENTION_SIZE)
          elif model_type == 'DIN-V2-gru-vec-attGru':
              model = Model_DIN_V2_Gru_Vec_attGru(n_uid, n_mid, n_cat, EMBEDDING_DIM, HIDDEN_SIZE, ATTENTION_SIZE)
          elif model_type == 'DIEN':
              model = Model_DIN_V2_Gru_Vec_attGru_Neg(n_uid, n_mid, n_cat, EMBEDDING_DIM, HIDDEN_SIZE, ATTENTION_SIZE)
          else:
              print("Invalid model_type : %s" % model_type)
              return
          model.restore(sess, model_path)
          print('test_auc: %.4f ---- test_loss: %.4f ---- test_accuracy: %.4f ---- test_aux_loss: %.4f' % eval(sess, test_data, model, model_path))

  if __name__ == '__main__':
      if len(sys.argv) == 4:
          SEED = int(sys.argv[3])
      else:
          SEED = 3
      tf.set_random_seed(SEED)
      numpy.random.seed(SEED)
      random.seed(SEED)
      if sys.argv[1] == 'train':
          train(model_type=sys.argv[2], seed=SEED)
      elif sys.argv[1] == 'test':
          test(model_type=sys.argv[2], seed=SEED)
      else:
          print('do nothing...')

Obtaining the Training Data

This section first covers how the training data generated earlier is read and iterated over to produce the final training batches.

Loading and Iterating over the Samples

The training and test samples are loaded as follows:

  train_data = DataIterator(train_file, uid_voc, mid_voc, cat_voc, batch_size, maxlen, shuffle_each_epoch=False)
  test_data = DataIterator(test_file, uid_voc, mid_voc, cat_voc, batch_size, maxlen)

The training and test samples are obtained through the DataIterator class, defined in data_iterator.py. The fully annotated code is as follows:

  import numpy
  import json
  import _pickle as pkl
  import random
  import gzip
  import shuffle

  def unicode_to_utf8(d):
      return dict((key.encode("UTF-8"), value) for (key, value) in d.items())

  def load_dict(filename):
      try:
          with open(filename, 'rb') as f:
              return unicode_to_utf8(json.load(f))
      except:
          with open(filename, 'rb') as f:
              # return unicode_to_utf8(pkl.load(f))
              return pkl.load(f)

  def fopen(filename, mode='r'):
      if filename.endswith('.gz'):
          return gzip.open(filename, mode)
      return open(filename, mode)

  class DataIterator:

      def __init__(self, source,  # local_train_splitByUser
                   # line 1 of each pair: label 0, user id, item id (not clicked, negative), item category,
                   #                      all previously clicked item ids, all previously clicked item categories
                   # line 2 of each pair: label 1, user id, item id (clicked, positive), item category,
                   #                      all previously clicked item ids, all previously clicked item categories
                   uid_voc,  # user id vocabulary, uid_voc.pkl
                   mid_voc,  # item id vocabulary, mid_voc.pkl
                   cat_voc,  # category id vocabulary, cat_voc.pkl
                   batch_size=128,
                   maxlen=100,
                   skip_empty=False,
                   shuffle_each_epoch=False,
                   sort_by_length=True,
                   max_batch_size=20,
                   minlen=None):
          if shuffle_each_epoch:
              self.source_orig = source
              self.source = shuffle.main(self.source_orig, temporary=True)
          else:
              self.source = fopen(source, 'r')
          self.source_dicts = []
          for source_dict in [uid_voc, mid_voc, cat_voc]:
              self.source_dicts.append(load_dict(source_dict))  # uid_voc, mid_voc and cat_voc

          f_meta = open("item-info", "r")
          # fields of item-info: item id, item category (a name such as "Cables & Accessories")
          meta_map = {}  # maps item id to item category
          for line in f_meta:
              arr = line.strip().split("\t")
              if arr[0] not in meta_map:
                  meta_map[arr[0]] = arr[1]
          self.meta_id_map = {}
          for key in meta_map:
              val = meta_map[key]  # category of this item id
              if key in self.source_dicts[1]:
                  mid_idx = self.source_dicts[1][key]  # index of the item id
              else:
                  mid_idx = 0
              if val in self.source_dicts[2]:
                  cat_idx = self.source_dicts[2][val]  # index of the category
              else:
                  cat_idx = 0
              self.meta_id_map[mid_idx] = cat_idx  # maps item id index to category index

          f_review = open("reviews-info", "r")
          # fields of reviews-info: user id, item id, rating of the product (float), timestamp
          self.mid_list_for_random = []
          for line in f_review:
              arr = line.strip().split("\t")
              tmp_idx = 0
              if arr[1] in self.source_dicts[1]:  # mid_voc
                  tmp_idx = self.source_dicts[1][arr[1]]
              self.mid_list_for_random.append(tmp_idx)  # index of the item id

          self.batch_size = batch_size
          self.maxlen = maxlen
          self.minlen = minlen
          self.skip_empty = skip_empty
          self.n_uid = len(self.source_dicts[0])  # number of users
          self.n_mid = len(self.source_dicts[1])  # number of items
          self.n_cat = len(self.source_dicts[2])  # number of categories
          self.shuffle = shuffle_each_epoch
          self.sort_by_length = sort_by_length
          self.source_buffer = []
          self.k = batch_size * max_batch_size
          self.end_of_data = False

      def get_n(self):
          return self.n_uid, self.n_mid, self.n_cat

      def __iter__(self):
          return self

      def reset(self):
          if self.shuffle:
              self.source = shuffle.main(self.source_orig, temporary=True)
          else:
              self.source.seek(0)

      def __next__(self):
          if self.end_of_data:
              self.end_of_data = False
              self.reset()
              raise StopIteration

          source = []
          target = []

          if len(self.source_buffer) == 0:
              for k_ in range(self.k):
                  ss = self.source.readline()
                  # line 1 of each pair: label 0, user id, item id (negative), item category, click-history ids, click-history categories
                  # line 2 of each pair: label 1, user id, item id (positive), item category, click-history ids, click-history categories
                  if ss == "":
                      break
                  self.source_buffer.append(ss.strip("\n").split("\t"))
                  # list: label, user id, item id, item category, all clicked item ids, all clicked item categories

              # sort by history behavior length
              if self.sort_by_length:  # True by default
                  # "\x02" is the separator the preprocessing scripts use inside the history fields;
                  # it is a control character and easily gets lost when the code is copied
                  his_length = numpy.array([len(s[4].split("\x02")) for s in self.source_buffer])
                  tidx = his_length.argsort()
                  _sbuf = [self.source_buffer[i] for i in tidx]
                  self.source_buffer = _sbuf  # sorted by number of previously clicked items
              else:
                  self.source_buffer.reverse()

          if len(self.source_buffer) == 0:
              self.end_of_data = False
              self.reset()
              raise StopIteration

          try:
              # actual work here
              while True:
                  # read from source file and map to word index
                  try:
                      ss = self.source_buffer.pop()  # label, user id, item id, item category, clicked ids, clicked categories
                  except IndexError:
                      break
                  uid = self.source_dicts[0][ss[1]] if ss[1] in self.source_dicts[0] else 0  # user id index
                  mid = self.source_dicts[1][ss[2]] if ss[2] in self.source_dicts[1] else 0  # item id index
                  cat = self.source_dicts[2][ss[3]] if ss[3] in self.source_dicts[2] else 0  # category index
                  tmp = []
                  for fea in ss[4].split("\x02"):
                      m = self.source_dicts[1][fea] if fea in self.source_dicts[1] else 0
                      tmp.append(m)
                  mid_list = tmp  # indices of all clicked item ids
                  tmp1 = []
                  for fea in ss[5].split("\x02"):
                      c = self.source_dicts[2][fea] if fea in self.source_dicts[2] else 0
                      tmp1.append(c)
                  cat_list = tmp1  # indices of all clicked item categories

                  # read from source file and map to word index
                  # if len(mid_list) > self.maxlen:
                  #     continue
                  if self.minlen != None:
                      if len(mid_list) <= self.minlen:
                          continue
                  if self.skip_empty and (not mid_list):
                      continue

                  # sample 5 negative (non-clicked) items for every positive click
                  noclk_mid_list = []
                  noclk_cat_list = []
                  for pos_mid in mid_list:
                      noclk_tmp_mid = []
                      noclk_tmp_cat = []
                      noclk_index = 0
                      while True:
                          noclk_mid_indx = random.randint(0, len(self.mid_list_for_random) - 1)
                          noclk_mid = self.mid_list_for_random[noclk_mid_indx]
                          if noclk_mid == pos_mid:
                              continue
                          noclk_tmp_mid.append(noclk_mid)
                          noclk_tmp_cat.append(self.meta_id_map[noclk_mid])
                          noclk_index += 1
                          if noclk_index >= 5:
                              break
                      noclk_mid_list.append(noclk_tmp_mid)
                      noclk_cat_list.append(noclk_tmp_cat)
                  source.append([uid, mid, cat, mid_list, cat_list, noclk_mid_list, noclk_cat_list])
                  # user id, item id, item category, clicked items (n), clicked categories (n),
                  # non-clicked items (n*5), non-clicked categories (n*5)
                  target.append([float(ss[0]), 1 - float(ss[0])])
                  # label, positive or negative sample

                  if len(source) >= self.batch_size or len(target) >= self.batch_size:
                      break
          except IOError:
              self.end_of_data = True

          # all sentence pairs in maxibatch filtered out because of length
          if len(source) == 0 or len(target) == 0:
              source, target = self.__next__()

          return source, target
          # source: N samples, each with the fields
          #   user id, item id, item category, clicked items (n), clicked categories (n),
          #   non-clicked items (n*5), non-clicked categories (n*5)
          # target: labels of the N samples: [0, 1] or [1, 0]

The iterator opens the training file local_train_splitByUser and the test file local_test_splitByUser. As explained in the data-processing installment, every two lines of these files form a positive/negative sample pair for one user click behavior, in the following format:

  # line 1: label 0, user id, item id (not clicked, negative sample), item category, all previously clicked item ids, all previously clicked item categories
  # line 2: label 1, user id, item id (clicked, positive sample), item category, all previously clicked item ids, all previously clicked item categories

uid_voc, mid_voc and cat_voc map user ids, item ids and item categories to integer indices. The __next__ method is what makes iterating with a for loop work; the processing steps are annotated in detail in the code above. It finally returns two variables, source and target, holding the following:

  # return value 1, source: N samples, each with the fields
  #   user id, item id, item category, clicked items (n), clicked categories (n),
  #   non-clicked items (n*5), non-clicked categories (n*5)
  # return value 2, target: labels of the N samples: [0, 1] or [1, 0]
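
A minimal usage sketch (file names as in the repository; note the constructor also reads item-info and reviews-info from the working directory, so those files must exist):

  train_data = DataIterator("local_train_splitByUser", "uid_voc.pkl", "mid_voc.pkl", "cat_voc.pkl",
                            batch_size=128, maxlen=100)
  n_uid, n_mid, n_cat = train_data.get_n()
  for src, tgt in train_data:
      print(len(src), len(tgt))  # each batch holds up to 128 samples in the format above
      break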

Data Preparation

After the raw sample batches are obtained, the data still has to be padded and arranged into arrays. The preparation code is:

  for src, tgt in train_data:
      # src: user id, item id, item category, clicked items (n), clicked categories (n),
      #      non-clicked items (n*5), non-clicked categories (n*5)
      # tgt: positive or negative label
      uids, mids, cats, mid_his, cat_his, mid_mask, target, sl, noclk_mids, noclk_cats = prepare_data(src, tgt, maxlen, return_neg=True)

The work is done mainly by the prepare_data function:

  def prepare_data(input, target, maxlen=None, return_neg=False):
      # input: N training samples; each sample has the fields
      #   user id [0], item id [1], item category [2], clicked items (n) [3], clicked item categories (n) [4],
      #   non-clicked items (n*5) [5], non-clicked item categories (n*5) [6]
      # target: positive or negative label
      lengths_x = [len(s[4]) for s in input]      # N, number of previously clicked items per sample
      seqs_mid = [inp[3] for inp in input]        # N * n, clicked-item sequences
      seqs_cat = [inp[4] for inp in input]        # N * n, clicked-category sequences
      noclk_seqs_mid = [inp[5] for inp in input]  # N * n * 5, non-clicked item sequences
      noclk_seqs_cat = [inp[6] for inp in input]  # N * n * 5, non-clicked category sequences

      if maxlen is not None:
          new_seqs_mid = []
          new_seqs_cat = []
          new_noclk_seqs_mid = []
          new_noclk_seqs_cat = []
          new_lengths_x = []
          for l_x, inp in zip(lengths_x, input):  # zip yields a sequence of tuples, as long as the shortest input
              if l_x > maxlen:  # keep only the most recent maxlen clicks
                  new_seqs_mid.append(inp[3][l_x - maxlen:])
                  new_seqs_cat.append(inp[4][l_x - maxlen:])
                  new_noclk_seqs_mid.append(inp[5][l_x - maxlen:])
                  new_noclk_seqs_cat.append(inp[6][l_x - maxlen:])
                  new_lengths_x.append(maxlen)
              else:
                  new_seqs_mid.append(inp[3])
                  new_seqs_cat.append(inp[4])
                  new_noclk_seqs_mid.append(inp[5])
                  new_noclk_seqs_cat.append(inp[6])
                  new_lengths_x.append(l_x)
          lengths_x = new_lengths_x
          seqs_mid = new_seqs_mid
          seqs_cat = new_seqs_cat
          noclk_seqs_mid = new_noclk_seqs_mid
          noclk_seqs_cat = new_noclk_seqs_cat
          if len(lengths_x) < 1:
              return None, None, None, None

      n_samples = len(seqs_mid)        # number of samples N
      maxlen_x = numpy.max(lengths_x)  # longest click history in this batch
      if maxlen_x <= 1:
          maxlen_x = 2  # keep at least length 2: the auxiliary loss pairs step t with step t+1
      neg_samples = len(noclk_seqs_mid[0][0])  # negative samples per historical click (5)

      mid_his = numpy.zeros((n_samples, maxlen_x)).astype('int64')     # N * maxlen_x, clicked item id sequences
      cat_his = numpy.zeros((n_samples, maxlen_x)).astype('int64')     # N * maxlen_x, clicked item category sequences
      noclk_mid_his = numpy.zeros((n_samples, maxlen_x, neg_samples)).astype('int64')  # N * maxlen_x * 5, negative items per click
      noclk_cat_his = numpy.zeros((n_samples, maxlen_x, neg_samples)).astype('int64')  # N * maxlen_x * 5, negative categories per click
      mid_mask = numpy.zeros((n_samples, maxlen_x)).astype('float32')  # N * maxlen_x, marks the valid click positions
      for idx, [s_x, s_y, no_sx, no_sy] in enumerate(zip(seqs_mid, seqs_cat, noclk_seqs_mid, noclk_seqs_cat)):
          mid_mask[idx, :lengths_x[idx]] = 1.             # the first lengths_x[idx] positions (actual clicks) are set to 1
          mid_his[idx, :lengths_x[idx]] = s_x             # clicked item ids of sample idx
          cat_his[idx, :lengths_x[idx]] = s_y             # clicked item categories of sample idx
          noclk_mid_his[idx, :lengths_x[idx], :] = no_sx  # negative item ids of sample idx
          noclk_cat_his[idx, :lengths_x[idx], :] = no_sy  # negative item categories of sample idx

      uids = numpy.array([inp[0] for inp in input])  # N, user ids
      mids = numpy.array([inp[1] for inp in input])  # N, candidate item ids
      cats = numpy.array([inp[2] for inp in input])  # N, candidate item categories
      if return_neg:
          # the meaning and shape of each returned variable is listed after this code block
          return uids, mids, cats, mid_his, cat_his, mid_mask, numpy.array(target), numpy.array(lengths_x), noclk_mid_his, noclk_cat_his
      else:
          return uids, mids, cats, mid_his, cat_his, mid_mask, numpy.array(target), numpy.array(lengths_x)

maxlen = 100 means the user's click history is truncated to at most the 100 most recent items. The code is annotated in detail above; it finally returns uids, mids, cats, mid_his, cat_his, mid_mask, target, sl, noclk_mids, noclk_cats, defined as follows:

  # uids: N, user ids
  # mids: N, candidate item ids
  # cats: N, candidate item categories
  # mid_his: N * maxlen_x, clicked item id sequences
  # cat_his: N * maxlen_x, clicked item category sequences
  # mid_mask: N * maxlen_x, 1 at valid click positions, 0 at padding
  # target: N * 2, label, positive [1, 0] or negative [0, 1]
  # sl: N, actual click-history lengths
  # noclk_mids: N * maxlen_x * 5, negative items per historical click
  # noclk_cats: N * maxlen_x * 5, negative categories per historical click

This is all the data the model needs for training.
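
As a quick illustration of the padding and masking, here is a minimal sketch with made-up ids (every value is hypothetical; only the structure matters):

  # two toy samples: user id, item id, category, clicked items, clicked categories,
  # 5 negatives per click, their categories
  src = [
      [1, 10, 3, [11, 12], [3, 3], [[91, 92, 93, 94, 95]] * 2, [[4, 4, 4, 4, 4]] * 2],
      [2, 20, 5, [21, 22, 23], [5, 5, 5], [[81, 82, 83, 84, 85]] * 3, [[6, 6, 6, 6, 6]] * 3],
  ]
  tgt = [[1, 0], [0, 1]]
  uids, mids, cats, mid_his, cat_his, mid_mask, target, sl, noclk_mids, noclk_cats = \
      prepare_data(src, tgt, maxlen=100, return_neg=True)
  print(mid_his)           # [[11 12  0] [21 22 23]] -- sample 0 is padded to the batch maximum of 3
  print(mid_mask)          # [[1. 1. 0.] [1. 1. 1.]]
  print(sl)                # [2 3]
  print(noclk_mids.shape)  # (2, 3, 5)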

Model Structure

The model is instantiated as follows:

  model = Model_DIN_V2_Gru_Vec_attGru_Neg(n_uid, n_mid, n_cat, EMBEDDING_DIM, HIDDEN_SIZE, ATTENTION_SIZE)

Base Network Structure

The network class Model_DIN_V2_Gru_Vec_attGru_Neg is defined in model.py:

  class Model_DIN_V2_Gru_Vec_attGru_Neg(Model):
      def __init__(self, n_uid, n_mid, n_cat, EMBEDDING_DIM, HIDDEN_SIZE, ATTENTION_SIZE, use_negsampling=True):
          # number of users, number of items, number of categories, 18, 18 * 2, 18 * 2
          super(Model_DIN_V2_Gru_Vec_attGru_Neg, self).__init__(n_uid, n_mid, n_cat,
                                                                EMBEDDING_DIM, HIDDEN_SIZE, ATTENTION_SIZE,
                                                                use_negsampling)

          # RNN layer(-s)
          with tf.name_scope('rnn_1'):
              # item_his_eb: clicked-item embeddings and category embeddings concatenated, [batch_size, n, EMBEDDING_DIM * 2]
              # seq_len_ph: [batch_size], actual click-history lengths
              # HIDDEN_SIZE: 36
              rnn_outputs, _ = dynamic_rnn(GRUCell(HIDDEN_SIZE), inputs=self.item_his_eb,
                                           sequence_length=self.seq_len_ph, dtype=tf.float32,
                                           scope="gru1")
              # [batch_size, n, HIDDEN_SIZE]
              tf.summary.histogram('GRU_outputs', rnn_outputs)

          # rnn_outputs[:, :-1, :]: interest states at steps 0..n-2, [batch_size, n - 1, HIDDEN_SIZE]
          # item_his_eb[:, 1:, :]: embeddings of the items clicked one step later (steps 1..n-1), [batch_size, n - 1, EMBEDDING_DIM * 2]
          # noclk_item_his_eb[:, 1:, :]: the first negative sample of each click, [batch_size, n - 1, EMBEDDING_DIM * 2]
          # mask[:, 1:]: valid positions of each click sequence, [batch_size, n - 1]
          aux_loss_1 = self.auxiliary_loss(rnn_outputs[:, :-1, :], self.item_his_eb[:, 1:, :],
                                           self.noclk_item_his_eb[:, 1:, :],
                                           self.mask[:, 1:], stag="gru")
          self.aux_loss = aux_loss_1

          # Attention layer
          with tf.name_scope('Attention_layer_1'):
              # item_eb: candidate mid embedding and cat embedding concatenated, [batch_size, EMBEDDING_DIM * 2]
              # rnn_outputs: user interest states from GRU1, [batch_size, n, HIDDEN_SIZE]
              att_outputs, alphas = din_fcn_attention(self.item_eb, rnn_outputs, ATTENTION_SIZE, self.mask,
                                                      softmax_stag=1, stag='1_1', mode='LIST', return_alphas=True)
              # output: [batch_size, n, HIDDEN_SIZE]
              # alphas: attention score of each historical interest state against the candidate item, [batch_size, n]
              tf.summary.histogram('alpha_outputs', alphas)

          with tf.name_scope('rnn_2'):
              rnn_outputs2, final_state2 = dynamic_rnn(VecAttGRUCell(HIDDEN_SIZE), inputs=rnn_outputs,
                                                       att_scores=tf.expand_dims(alphas, -1),
                                                       sequence_length=self.seq_len_ph, dtype=tf.float32,
                                                       scope="gru2")
              # this implements the AUGRU; final_state2: [batch_size, HIDDEN_SIZE]
              tf.summary.histogram('GRU2_Final_State', final_state2)

          inp = tf.concat([self.uid_batch_embedded, self.item_eb, self.item_his_eb_sum, self.item_eb * self.item_his_eb_sum, final_state2], 1)
          # uid_batch_embedded: user embedding, [batch_size, EMBEDDING_DIM]
          # item_eb: candidate mid embedding and cat embedding concatenated, [batch_size, EMBEDDING_DIM * 2]
          # item_his_eb_sum: sum of the history embeddings, [batch_size, EMBEDDING_DIM * 2]
          # final_state2: final interest state from the attention-weighted GRU (AUGRU)
          # all features are concatenated and fed into the fully connected network
          self.build_fcn_net(inp, use_dice=True)

Its parent class Model is also defined in model.py:

  class Model(object):
      def __init__(self, n_uid, n_mid, n_cat, EMBEDDING_DIM, HIDDEN_SIZE, ATTENTION_SIZE, use_negsampling=False):
          with tf.name_scope('Inputs'):
              self.mid_his_batch_ph = tf.placeholder(tf.int32, [None, None], name='mid_his_batch_ph')
              self.cat_his_batch_ph = tf.placeholder(tf.int32, [None, None], name='cat_his_batch_ph')
              self.uid_batch_ph = tf.placeholder(tf.int32, [None, ], name='uid_batch_ph')
              self.mid_batch_ph = tf.placeholder(tf.int32, [None, ], name='mid_batch_ph')
              self.cat_batch_ph = tf.placeholder(tf.int32, [None, ], name='cat_batch_ph')
              self.mask = tf.placeholder(tf.float32, [None, None], name='mask')
              self.seq_len_ph = tf.placeholder(tf.int32, [None], name='seq_len_ph')
              self.target_ph = tf.placeholder(tf.float32, [None, None], name='target_ph')
              self.lr = tf.placeholder(tf.float64, [])
              self.use_negsampling = use_negsampling
              if use_negsampling:
                  self.noclk_mid_batch_ph = tf.placeholder(tf.int32, [None, None, None], name='noclk_mid_batch_ph')  # negative item ids from negative sampling (5 per click in this pipeline)
                  self.noclk_cat_batch_ph = tf.placeholder(tf.int32, [None, None, None], name='noclk_cat_batch_ph')

          # Embedding layer
          with tf.name_scope('Embedding_layer'):
              # uid embedding layer
              self.uid_embeddings_var = tf.get_variable("uid_embedding_var", [n_uid, EMBEDDING_DIM])
              tf.summary.histogram('uid_embeddings_var', self.uid_embeddings_var)
              self.uid_batch_embedded = tf.nn.embedding_lookup(self.uid_embeddings_var, self.uid_batch_ph)

              # mid embedding layer
              self.mid_embeddings_var = tf.get_variable("mid_embedding_var", [n_mid, EMBEDDING_DIM])
              tf.summary.histogram('mid_embeddings_var', self.mid_embeddings_var)
              self.mid_batch_embedded = tf.nn.embedding_lookup(self.mid_embeddings_var, self.mid_batch_ph)
              self.mid_his_batch_embedded = tf.nn.embedding_lookup(self.mid_embeddings_var, self.mid_his_batch_ph)
              if self.use_negsampling:
                  self.noclk_mid_his_batch_embedded = tf.nn.embedding_lookup(self.mid_embeddings_var, self.noclk_mid_batch_ph)
                  # [batch_size, n, 5, EMBEDDING_DIM]

              # cat embedding layer
              self.cat_embeddings_var = tf.get_variable("cat_embedding_var", [n_cat, EMBEDDING_DIM])
              tf.summary.histogram('cat_embeddings_var', self.cat_embeddings_var)
              self.cat_batch_embedded = tf.nn.embedding_lookup(self.cat_embeddings_var, self.cat_batch_ph)
              self.cat_his_batch_embedded = tf.nn.embedding_lookup(self.cat_embeddings_var, self.cat_his_batch_ph)
              if self.use_negsampling:
                  self.noclk_cat_his_batch_embedded = tf.nn.embedding_lookup(self.cat_embeddings_var, self.noclk_cat_batch_ph)
                  # [batch_size, n, 5, EMBEDDING_DIM]

          self.item_eb = tf.concat([self.mid_batch_embedded, self.cat_batch_embedded], 1)
          # candidate mid embedding and cat embedding concatenated, [batch_size, EMBEDDING_DIM * 2]
          self.item_his_eb = tf.concat([self.mid_his_batch_embedded, self.cat_his_batch_embedded], 2)
          # clicked-item embeddings and category embeddings concatenated, [batch_size, n, EMBEDDING_DIM * 2]
          self.item_his_eb_sum = tf.reduce_sum(self.item_his_eb, 1)  # sum of the history embeddings, [batch_size, EMBEDDING_DIM * 2]
          if self.use_negsampling:
              self.noclk_item_his_eb = tf.concat(
                  [self.noclk_mid_his_batch_embedded[:, :, 0, :], self.noclk_cat_his_batch_embedded[:, :, 0, :]], -1)
              # index 0 means only the first of the 5 sampled negative item ids is used, [batch_size, n, EMBEDDING_DIM * 2]
              self.noclk_item_his_eb = tf.reshape(self.noclk_item_his_eb,
                                                  [-1, tf.shape(self.noclk_mid_his_batch_embedded)[1], 36])
              # item embedding (18) concatenated with cat embedding (18)
              self.noclk_his_eb = tf.concat([self.noclk_mid_his_batch_embedded, self.noclk_cat_his_batch_embedded], -1)
              # [batch_size, n, 5, EMBEDDING_DIM * 2]
              self.noclk_his_eb_sum_1 = tf.reduce_sum(self.noclk_his_eb, 2)
              # [batch_size, n, EMBEDDING_DIM * 2]
              self.noclk_his_eb_sum = tf.reduce_sum(self.noclk_his_eb_sum_1, 1)
              # [batch_size, EMBEDDING_DIM * 2]

      def build_fcn_net(self, inp, use_dice=False):
          bn1 = tf.layers.batch_normalization(inputs=inp, name='bn1')
          dnn1 = tf.layers.dense(bn1, 200, activation=None, name='f1')
          if use_dice:
              dnn1 = dice(dnn1, name='dice_1')
          else:
              dnn1 = prelu(dnn1, 'prelu1')
          dnn2 = tf.layers.dense(dnn1, 80, activation=None, name='f2')
          if use_dice:
              dnn2 = dice(dnn2, name='dice_2')
          else:
              dnn2 = prelu(dnn2, 'prelu2')
          dnn3 = tf.layers.dense(dnn2, 2, activation=None, name='f3')
          self.y_hat = tf.nn.softmax(dnn3) + 0.00000001

          with tf.name_scope('Metrics'):
              # Cross-entropy loss and optimizer initialization
              ctr_loss = -tf.reduce_mean(tf.log(self.y_hat) * self.target_ph)
              self.loss = ctr_loss
              if self.use_negsampling:
                  self.loss += self.aux_loss
              tf.summary.scalar('loss', self.loss)
              self.optimizer = tf.train.AdamOptimizer(learning_rate=self.lr).minimize(self.loss)
              # Accuracy metric
              self.accuracy = tf.reduce_mean(tf.cast(tf.equal(tf.round(self.y_hat), self.target_ph), tf.float32))
              tf.summary.scalar('accuracy', self.accuracy)

          self.merged = tf.summary.merge_all()

      def auxiliary_loss(self, h_states, click_seq, noclick_seq, mask, stag=None):
          # h_states: interest states at steps 0..n-2, [batch_size, n - 1, HIDDEN_SIZE]
          # click_seq: embeddings of the items clicked one step later, [batch_size, n - 1, EMBEDDING_DIM * 2]
          # noclick_seq: first negative sample of each click, [batch_size, n - 1, EMBEDDING_DIM * 2]
          # mask: valid positions of each click sequence, [batch_size, n - 1]
          mask = tf.cast(mask, tf.float32)
          click_input_ = tf.concat([h_states, click_seq], -1)      # [batch_size, n - 1, HIDDEN_SIZE + EMBEDDING_DIM * 2]
          noclick_input_ = tf.concat([h_states, noclick_seq], -1)  # [batch_size, n - 1, HIDDEN_SIZE + EMBEDDING_DIM * 2]
          click_prop_ = self.auxiliary_net(click_input_, stag=stag)[:, :, 0]      # [batch_size, n - 1]
          noclick_prop_ = self.auxiliary_net(noclick_input_, stag=stag)[:, :, 0]  # [batch_size, n - 1]
          click_loss_ = -tf.reshape(tf.log(click_prop_), [-1, tf.shape(click_seq)[1]]) * mask
          noclick_loss_ = -tf.reshape(tf.log(1.0 - noclick_prop_), [-1, tf.shape(noclick_seq)[1]]) * mask
          loss_ = tf.reduce_mean(click_loss_ + noclick_loss_)
          return loss_

      def auxiliary_net(self, in_, stag='auxiliary_net'):
          # in_: [batch_size, n - 1, HIDDEN_SIZE + EMBEDDING_DIM * 2]
          bn1 = tf.layers.batch_normalization(inputs=in_, name='bn1' + stag, reuse=tf.AUTO_REUSE)
          dnn1 = tf.layers.dense(bn1, 100, activation=None, name='f1' + stag, reuse=tf.AUTO_REUSE)
          dnn1 = tf.nn.sigmoid(dnn1)
          dnn2 = tf.layers.dense(dnn1, 50, activation=None, name='f2' + stag, reuse=tf.AUTO_REUSE)
          dnn2 = tf.nn.sigmoid(dnn2)
          dnn3 = tf.layers.dense(dnn2, 2, activation=None, name='f3' + stag, reuse=tf.AUTO_REUSE)
          y_hat = tf.nn.softmax(dnn3) + 0.00000001
          return y_hat

      def train(self, sess, inps):
          if self.use_negsampling:
              loss, accuracy, aux_loss, _ = sess.run([self.loss, self.accuracy, self.aux_loss, self.optimizer], feed_dict={
                  self.uid_batch_ph: inps[0],
                  self.mid_batch_ph: inps[1],
                  self.cat_batch_ph: inps[2],
                  self.mid_his_batch_ph: inps[3],
                  self.cat_his_batch_ph: inps[4],
                  self.mask: inps[5],
                  self.target_ph: inps[6],
                  self.seq_len_ph: inps[7],
                  self.lr: inps[8],
                  self.noclk_mid_batch_ph: inps[9],
                  self.noclk_cat_batch_ph: inps[10],
              })
              return loss, accuracy, aux_loss
          else:
              loss, accuracy, _ = sess.run([self.loss, self.accuracy, self.optimizer], feed_dict={
                  self.uid_batch_ph: inps[0],
                  self.mid_batch_ph: inps[1],
                  self.cat_batch_ph: inps[2],
                  self.mid_his_batch_ph: inps[3],
                  self.cat_his_batch_ph: inps[4],
                  self.mask: inps[5],
                  self.target_ph: inps[6],
                  self.seq_len_ph: inps[7],
                  self.lr: inps[8],
              })
              return loss, accuracy, 0

      def calculate(self, sess, inps):
          if self.use_negsampling:
              probs, loss, accuracy, aux_loss = sess.run([self.y_hat, self.loss, self.accuracy, self.aux_loss], feed_dict={
                  self.uid_batch_ph: inps[0],  # inps = [uids, mids, cats, mid_his, cat_his, mid_mask, target, sl, noclk_mids, noclk_cats]
                  self.mid_batch_ph: inps[1],
                  self.cat_batch_ph: inps[2],
                  self.mid_his_batch_ph: inps[3],
                  self.cat_his_batch_ph: inps[4],
                  self.mask: inps[5],
                  self.target_ph: inps[6],
                  self.seq_len_ph: inps[7],
                  self.noclk_mid_batch_ph: inps[8],
                  self.noclk_cat_batch_ph: inps[9],
              })
              return probs, loss, accuracy, aux_loss
          else:
              probs, loss, accuracy = sess.run([self.y_hat, self.loss, self.accuracy], feed_dict={
                  self.uid_batch_ph: inps[0],
                  self.mid_batch_ph: inps[1],
                  self.cat_batch_ph: inps[2],
                  self.mid_his_batch_ph: inps[3],
                  self.cat_his_batch_ph: inps[4],
                  self.mask: inps[5],
                  self.target_ph: inps[6],
                  self.seq_len_ph: inps[7]
              })
              return probs, loss, accuracy, 0

      def save(self, sess, path):
          saver = tf.train.Saver()
          saver.save(sess, save_path=path)

      def restore(self, sess, path):
          saver = tf.train.Saver()
          saver.restore(sess, save_path=path)
          print('model restored from %s' % path)

Interest Extractor Layer

The interest extractor layer is implemented as follows:

  # RNN layer(-s)
  with tf.name_scope('rnn_1'):
      # item_his_eb: clicked-item embeddings and category embeddings concatenated, [batch_size, n, EMBEDDING_DIM * 2]
      # seq_len_ph: [batch_size], actual click-history lengths
      # HIDDEN_SIZE: 36
      rnn_outputs, _ = dynamic_rnn(GRUCell(HIDDEN_SIZE), inputs=self.item_his_eb,
                                   sequence_length=self.seq_len_ph, dtype=tf.float32,
                                   scope="gru1")
      # [batch_size, n, HIDDEN_SIZE]
      tf.summary.histogram('GRU_outputs', rnn_outputs)

  # rnn_outputs[:, :-1, :]: interest states at steps 0..n-2, [batch_size, n - 1, HIDDEN_SIZE]
  # item_his_eb[:, 1:, :]: embeddings of the items clicked one step later (steps 1..n-1), [batch_size, n - 1, EMBEDDING_DIM * 2]
  # noclk_item_his_eb[:, 1:, :]: the first negative sample of each click, [batch_size, n - 1, EMBEDDING_DIM * 2]
  # mask[:, 1:]: valid positions of each click sequence, [batch_size, n - 1]
  aux_loss_1 = self.auxiliary_loss(rnn_outputs[:, :-1, :], self.item_his_eb[:, 1:, :],
                                   self.noclk_item_his_eb[:, 1:, :],
                                   self.mask[:, 1:], stag="gru")
  self.aux_loss = aux_loss_1

As shown above, the embeddings of the user's previously clicked items are first fed into an RNN built on GRU cells, yielding the interest state vector rnn_outputs at every step.

Next, the hidden states at steps 0 to n-2 (rnn_outputs[:, :-1, :]), the embeddings of the items clicked one step later, at steps 1 to n-1 (item_his_eb[:, 1:, :]), the corresponding negative-item embeddings (noclk_item_his_eb[:, 1:, :]), and the mask marking the valid history positions (mask[:, 1:]) are fed into the auxiliary loss function, implemented as follows:

  def auxiliary_loss(self, h_states, click_seq, noclick_seq, mask, stag=None):
      # h_states: interest states at steps 0..n-2, [batch_size, n - 1, HIDDEN_SIZE]
      # click_seq: embeddings of the items clicked one step later, [batch_size, n - 1, EMBEDDING_DIM * 2]
      # noclick_seq: first negative sample of each click, [batch_size, n - 1, EMBEDDING_DIM * 2]
      # mask: valid positions of each click sequence, [batch_size, n - 1]
      mask = tf.cast(mask, tf.float32)
      click_input_ = tf.concat([h_states, click_seq], -1)      # [batch_size, n - 1, HIDDEN_SIZE + EMBEDDING_DIM * 2]
      noclick_input_ = tf.concat([h_states, noclick_seq], -1)  # [batch_size, n - 1, HIDDEN_SIZE + EMBEDDING_DIM * 2]
      click_prop_ = self.auxiliary_net(click_input_, stag=stag)[:, :, 0]      # [batch_size, n - 1]
      noclick_prop_ = self.auxiliary_net(noclick_input_, stag=stag)[:, :, 0]  # [batch_size, n - 1]
      click_loss_ = -tf.reshape(tf.log(click_prop_), [-1, tf.shape(click_seq)[1]]) * mask
      noclick_loss_ = -tf.reshape(tf.log(1.0 - noclick_prop_), [-1, tf.shape(noclick_seq)[1]]) * mask
      loss_ = tf.reduce_mean(click_loss_ + noclick_loss_)
      return loss_

The state sequence rnn_outputs returned by the first RNN layer is the interest sequence that models how the user's interest evolves.
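
This corresponds to the auxiliary loss of the DIEN paper (the notation below is reproduced from the paper, so treat the indexing as a sketch): each interest state h_t should score the item actually clicked next, e_b[t+1], high, and a sampled non-clicked item \hat{e}_b[t+1] low:

  L_{aux} = -\frac{1}{N} \sum_{i=1}^{N} \sum_{t} \Big[ \log \sigma\big(h_t^i, e_b^i[t+1]\big) + \log\big(1 - \sigma\big(h_t^i, \hat{e}_b^i[t+1]\big)\big) \Big]

where \sigma(\cdot, \cdot) is the click probability produced by auxiliary_net on the concatenated pair; in the code, the masked tf.reduce_mean plays the role of the sums.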

Interest Evolving Layer

The interest evolving layer is implemented as follows:

  # Attention layer
  with tf.name_scope('Attention_layer_1'):
      # item_eb: candidate mid embedding and cat embedding concatenated, [batch_size, EMBEDDING_DIM * 2]
      # rnn_outputs: user interest states from GRU1, [batch_size, n, HIDDEN_SIZE]
      att_outputs, alphas = din_fcn_attention(self.item_eb, rnn_outputs, ATTENTION_SIZE, self.mask,
                                              softmax_stag=1, stag='1_1', mode='LIST', return_alphas=True)
      # output: [batch_size, n, HIDDEN_SIZE]
      # alphas: attention score of each historical interest state against the candidate item, [batch_size, n]
      tf.summary.histogram('alpha_outputs', alphas)

  with tf.name_scope('rnn_2'):
      rnn_outputs2, final_state2 = dynamic_rnn(VecAttGRUCell(HIDDEN_SIZE), inputs=rnn_outputs,
                                               att_scores=tf.expand_dims(alphas, -1),
                                               sequence_length=self.seq_len_ph, dtype=tf.float32,
                                               scope="gru2")
      # this implements the AUGRU; final_state2: [batch_size, HIDDEN_SIZE]
      tf.summary.histogram('GRU2_Final_State', final_state2)

  inp = tf.concat([self.uid_batch_embedded, self.item_eb, self.item_his_eb_sum, self.item_eb * self.item_his_eb_sum, final_state2], 1)
  # uid_batch_embedded: user embedding, [batch_size, EMBEDDING_DIM]
  # item_eb: candidate mid embedding and cat embedding concatenated, [batch_size, EMBEDDING_DIM * 2]
  # item_his_eb_sum: sum of the history embeddings, [batch_size, EMBEDDING_DIM * 2]
  # final_state2: final interest state from the attention-weighted GRU (AUGRU)
  # all features are concatenated and fed into the fully connected network
  self.build_fcn_net(inp, use_dice=True)

The attention layer takes each user's interest-evolution sequence rnn_outputs and the candidate item embedding item_eb, and uses the attention mechanism to weigh every historical interest state against the candidate item, returning the attention scores of shape [batch_size, n]. The attention layer is implemented as follows:

  def din_fcn_attention(query, facts, attention_size, mask, stag='null', mode='SUM', softmax_stag=1, time_major=False, return_alphas=False, forCnn=False):
      # query: candidate mid embedding and cat embedding concatenated, [batch_size, EMBEDDING_DIM * 2]
      # facts: user interest states from GRU1, [batch_size, n, HIDDEN_SIZE]
      # attention_size: 36
      # mask: valid positions of each click sequence, [batch_size, n]
      if isinstance(facts, tuple):
          # In case of Bi-RNN, concatenate the forward and the backward RNN outputs.
          facts = tf.concat(facts, 2)
      if len(facts.get_shape().as_list()) == 2:
          facts = tf.expand_dims(facts, 1)
      if time_major:
          # (T,B,D) => (B,T,D)
          facts = tf.transpose(facts, [1, 0, 2])
      # Trainable parameters
      mask = tf.equal(mask, tf.ones_like(mask))
      facts_size = facts.get_shape().as_list()[-1]  # D value - hidden size of the RNN layer
      # the output size of the previous GRU (GRU1), i.e. the input size of GRU2: HIDDEN_SIZE
      querry_size = query.get_shape().as_list()[-1]
      # embedding size of the candidate item: EMBEDDING_DIM * 2
      query = tf.layers.dense(query, facts_size, activation=None, name='f1' + stag)
      # fully connected layer, input: [batch_size, EMBEDDING_DIM * 2], output: [batch_size, HIDDEN_SIZE]
      query = prelu(query)
      # prelu nonlinearity
      queries = tf.tile(query, [1, tf.shape(facts)[1]])
      # repeat the query n times along the second dimension: [batch_size, HIDDEN_SIZE * n];
      # the candidate item embedding is duplicated once per history position
      queries = tf.reshape(queries, tf.shape(facts))
      # reshaped to [batch_size, n, HIDDEN_SIZE]
      din_all = tf.concat([queries, facts, queries - facts, queries * facts], axis=-1)
      # concatenated along the last dimension: [batch_size, n, 4 * HIDDEN_SIZE]
      d_layer_1_all = tf.layers.dense(din_all, 80, activation=tf.nn.sigmoid, name='f1_att' + stag)
      # first layer, output [batch_size, n, 80]
      d_layer_2_all = tf.layers.dense(d_layer_1_all, 40, activation=tf.nn.sigmoid, name='f2_att' + stag)
      # second layer, output [batch_size, n, 40]
      d_layer_3_all = tf.layers.dense(d_layer_2_all, 1, activation=None, name='f3_att' + stag)
      # third layer, output [batch_size, n, 1]
      d_layer_3_all = tf.reshape(d_layer_3_all, [-1, 1, tf.shape(facts)[1]])
      scores = d_layer_3_all
      # final shape [batch_size, 1, n]

      # Mask
      # key_masks = tf.sequence_mask(facts_length, tf.shape(facts)[1])  # [B, T]
      key_masks = tf.expand_dims(mask, 1)  # [B, 1, T]
      # marks which of the B * T positions are True (a click exists there) and which are False (padding);
      # e.g. tf.sequence_mask([1, 3, 2], 5) returns:
      #   [[True, False, False, False, False],
      #    [True, True,  True,  False, False],
      #    [True, True,  False, False, False]]
      paddings = tf.ones_like(scores) * (-2 ** 32 + 1)
      if not forCnn:
          scores = tf.where(key_masks, scores, paddings)  # [B, 1, T]

      # Scale
      # scores = scores / (facts.get_shape().as_list()[-1] ** 0.5)

      # Activation
      if softmax_stag:
          scores = tf.nn.softmax(scores)  # [B, 1, T] = [batch_size, 1, n]

      # Weighted sum
      if mode == 'SUM':
          output = tf.matmul(scores, facts)  # [B, 1, H]
          # output = tf.reshape(output, [-1, tf.shape(facts)[-1]])
      else:
          scores = tf.reshape(scores, [-1, tf.shape(facts)[1]])
          # reshaped to [batch_size, n]: the weight of each behavior of each sample
          output = facts * tf.expand_dims(scores, -1)
          # [batch_size, n, HIDDEN_SIZE] * [batch_size, n, 1]
          output = tf.reshape(output, tf.shape(facts))
          # [batch_size, n, HIDDEN_SIZE]
      if return_alphas:
          return output, scores
      return output

output holds each historical interest state weighted by its attention score, and scores holds the attention scores over the batch's click histories. This code is essentially the same as the attention layer in the DIN implementation discussed earlier.
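
In compact form, with q the projected candidate embedding and h_t the GRU1 state at step t, the layer computes

  a_t = \operatorname{softmax}_t\Big( f\big([\, q;\; h_t;\; q - h_t;\; q \odot h_t \,]\big) \Big)

where f is the 80-40-1 MLP above and masked (padding) positions are pushed to -2^{32}+1 before the softmax, so they receive essentially zero weight.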

The AUGRU layer (see Part 1 of this series for the underlying theory) is implemented as follows:

  with tf.name_scope('rnn_2'):
      rnn_outputs2, final_state2 = dynamic_rnn(VecAttGRUCell(HIDDEN_SIZE), inputs=rnn_outputs,
                                               att_scores=tf.expand_dims(alphas, -1),
                                               sequence_length=self.seq_len_ph, dtype=tf.float32,
                                               scope="gru2")
      # this implements the AUGRU; final_state2: [batch_size, HIDDEN_SIZE]
      tf.summary.histogram('GRU2_Final_State', final_state2)

  inp = tf.concat([self.uid_batch_embedded, self.item_eb, self.item_his_eb_sum, self.item_eb * self.item_his_eb_sum, final_state2], 1)
  # uid_batch_embedded: user embedding, [batch_size, EMBEDDING_DIM]
  # item_eb: candidate mid embedding and cat embedding concatenated, [batch_size, EMBEDDING_DIM * 2]
  # item_his_eb_sum: sum of the history embeddings, [batch_size, EMBEDDING_DIM * 2]
  # final_state2: final interest state from the attention-weighted GRU (AUGRU)
  # all features are concatenated and fed into the fully connected network

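What VecAttGRUCell changes relative to a plain GRU is only the update gate: the attention score a_t rescales it. In the notation of the DIEN paper,

  \tilde{u}'_t = a_t \cdot u'_t, \qquad h'_t = (1 - \tilde{u}'_t) \circ h'_{t-1} + \tilde{u}'_t \circ \tilde{h}'_t

so a behavior with little relevance to the candidate item barely moves the hidden state, while highly relevant behaviors dominate the final state.
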
dynamic_rnn is implemented in rnn.py; together with VecAttGRUCell it provides the core AUGRU functionality and returns final_state2, the final interest state after attention re-weighting. All features are then concatenated and fed into the fully connected network:

  def build_fcn_net(self, inp, use_dice=False):
      bn1 = tf.layers.batch_normalization(inputs=inp, name='bn1')
      dnn1 = tf.layers.dense(bn1, 200, activation=None, name='f1')
      if use_dice:
          dnn1 = dice(dnn1, name='dice_1')
      else:
          dnn1 = prelu(dnn1, 'prelu1')
      dnn2 = tf.layers.dense(dnn1, 80, activation=None, name='f2')
      if use_dice:
          dnn2 = dice(dnn2, name='dice_2')
      else:
          dnn2 = prelu(dnn2, 'prelu2')
      dnn3 = tf.layers.dense(dnn2, 2, activation=None, name='f3')
      self.y_hat = tf.nn.softmax(dnn3) + 0.00000001

      with tf.name_scope('Metrics'):
          # Cross-entropy loss and optimizer initialization
          ctr_loss = -tf.reduce_mean(tf.log(self.y_hat) * self.target_ph)
          self.loss = ctr_loss
          if self.use_negsampling:
              self.loss += self.aux_loss
          tf.summary.scalar('loss', self.loss)
          self.optimizer = tf.train.AdamOptimizer(learning_rate=self.lr).minimize(self.loss)
          # Accuracy metric
          self.accuracy = tf.reduce_mean(tf.cast(tf.equal(tf.round(self.y_hat), self.target_ph), tf.float32))
          tf.summary.scalar('accuracy', self.accuracy)

      self.merged = tf.summary.merge_all()

The final loss is the sum of ctr_loss and aux_loss; training simply minimizes it with the optimizer.
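
Written out, with \alpha the weight of the auxiliary loss (a hyperparameter in the paper; this implementation effectively fixes \alpha = 1, since the code adds aux_loss directly):

  L = L_{target} + \alpha \cdot L_{aux}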

Training the Model

Run:

  python train.py train DIEN
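
For reference, the __main__ block shown earlier also accepts an optional seed argument and a test mode, so the following invocations are all valid:

  python train.py train DIEN      # train with the default seed 3
  python train.py train DIEN 2    # train with seed 2
  python train.py test DIEN       # restore the best checkpoint and evaluate it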

to train DIEN. As training runs, the script periodically prints the train/test AUC, loss and accuracy lines produced by the logging statements above.

Implementations of the other network structures are also provided, but they are outside the scope of this installment and will be covered in other articles.


Reposted from: https://blog.csdn.net/fangfanglovezhou/article/details/122972650
Copyright belongs to the original author I_belong_to_jesus. In case of infringement, please contact us for removal.
