OGEEK Algorithm Challenge: Code Sharing
import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.preprocessing import StandardScaler
tf.reset_default_graph()
from datetime import datetime
now = datetime.utcnow().strftime("%Y%m%d%H%M%S")
root_logdir = r"D://tf_logs_2"
logdir = "{}/run_{}/".format(root_logdir,now)
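# The FileWriter below logs graph and loss summaries under root_logdir; to inspect them,
# TensorBoard can be pointed at that directory from a shell (assumed usage, not in the original post):
#   tensorboard --logdir D://tf_logs_2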
voc_size = len(vocab_dict) + 1   # vocab_dict is built in the preprocessing step (not shown); +1 leaves one extra embedding row (e.g. for padding/unknown)
embed_size = 25                  # dimension of the word embeddings
candidate_query = 1              # only a single candidate query id is embedded per sample
tag_size = 22                    # number of one-hot encoded tag categories
with tf.name_scope("query_input") as query_input:
query = tf.placeholder(tf.int32, shape=(None, candidate_query), name="query")
candidate_freq=tf.placeholder(tf.float32,shape=(None,candidate_query),name="candidate_freq")
with tf.name_scope("title_input") as title_input:
title = tf.placeholder(tf.int32, shape=(None, None), name="title")
with tf.name_scope("tag_input") as tag_input:
tag = tf.placeholder(tf.float32, shape=(None, tag_size), name="tag")
with tf.name_scope("ctr_input") as ctr_input:
ctr = tf.placeholder(tf.float32, shape=(None, 1), name="ctr")
with tf.name_scope("target") as target:
target = tf.placeholder(tf.float32, shape=(None, 1), name="target")
with tf.name_scope("embedding") as embedding:
embeddings = tf.get_variable("word_embeddings",[voc_size,embed_size])
query_embeddings = tf.reshape(tf.gather(embeddings, query),(-1,candidate_query,embed_size))
query_feature=tf.reshape(tf.matmul(tf.reshape(candidate_freq,(-1,1,candidate_query)),query_embeddings)/tf.reshape(tf.reduce_sum(candidate_freq,1),(-1,1,1)),(-1,embed_size))
title_feature=tf.reduce_mean(tf.gather(embeddings, title),1)
with tf.name_scope("feature_concat") as feature_concat:
feature=tf.concat([query_feature,title_feature,tag],1)
with tf.name_scope("dense") as dense:
dense = tf.layers.dense(inputs=feature, units=128, activation=tf.nn.relu)
y_pred=tf.layers.dense(inputs=dense,units=1,activation=tf.nn.sigmoid)
with tf.name_scope("Loss") as loss:
logloss=-tf.reduce_mean(tf.log(y_pred)*target+(1-target)*tf.log(1-y_pred))
loss_summary = tf.summary.scalar(loss, logloss)
with tf.name_scope("training_op") as training_op:
optimizer = tf.train.AdamOptimizer()
training_op = optimizer.minimize(logloss)
file_writer = tf.summary.FileWriter(logdir, tf.get_default_graph())
#n_epochs=300
n_epochs=81
"""
batch_size = len(query_train)
n_batches = int( np.ceil( len(query_train) / batch_size ) )
def fetch_batch( epoch, batch_index, batch_size ):
np.random.seed( epoch * n_batches + batch_index )
indices = np.random.randint( len(query_train), size = batch_size )
query_batch = np.array(query_train)[ indices ]
candidate_freq_batch = np.array(candidate_freq_train)[indices]
title_batch = np.array(title_train)[indices]
tag_batch = np.array(tag_train)[indices]
ctr_batch = np.array(ctr_train)[indices]
target_batch = np.array(target_train)[ indices ]
return query_batch, candidate_freq_batch,title_batch,tag_batch,target_batch,ctr_batch
"""
with tf.name_scope("init") as init:
init = tf.global_variables_initializer()
l=[]
l_test=[]
with tf.Session() as sess:
    sess.run(init)
    for epoch in range(n_epochs):
        print(epoch)
        # Full-batch training: the whole training set is fed each epoch (mini-batching is commented out below)
        feed_dict_train = {query: query_train, candidate_freq: candidate_freq_train, title: title_train,
                           target: target_train, tag: tag_train, ctr: ctr_train}
        """
        for batch in range(n_batches):
            print(batch)
            query_batch, candidate_freq_batch, title_batch, tag_batch, target_batch, ctr_batch = fetch_batch(epoch, batch, batch_size)
            feed_dict_batch = {query: query_batch, candidate_freq: candidate_freq_batch, title: title_batch,
                               target: target_batch, tag: tag_batch, ctr: ctr_batch}
        """
        sess.run(training_op, feed_dict_train)
        e = sess.run(logloss, feed_dict_train)
        feed_dict_vali = {query: query_vali, candidate_freq: candidate_freq_vali, title: title_vali,
                          target: target_vali, tag: tag_vali, ctr: ctr_vali}
        summary_str = loss_summary.eval(feed_dict=feed_dict_vali)   # validation log loss for TensorBoard
        file_writer.add_summary(summary_str, epoch)
        l.append(e)

    # Predictions on the test and validation sets after training
    feed_dict_test = {query: query_test, candidate_freq: candidate_freq_test, title: title_test,
                      tag: tag_test, ctr: ctr_test}
    ypred = sess.run(y_pred, feed_dict=feed_dict_test)
    ypred_vali = sess.run(y_pred, feed_dict=feed_dict_vali)

    # Store tensors in named collections so they can be retrieved after restoring the checkpoint
    tf.add_to_collection("y_pred", y_pred)
    tf.add_to_collection("feature", feature)   # the concatenated feature vector
    saver = tf.train.Saver()
    saver.save(sess, "./checkpoint_dir/MyModel_" + now)

file_writer.close()
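The Saver call above writes a checkpoint under ./checkpoint_dir/, and y_pred is stored in a collection before saving, so the trained model can be reloaded for prediction later. A minimal restore sketch (not part of the original script; it assumes the same now timestamp and the test arrays used above):

import tensorflow as tf

tf.reset_default_graph()
with tf.Session() as sess:
    # Rebuild the graph structure from the meta file, then load the trained weights.
    restorer = tf.train.import_meta_graph("./checkpoint_dir/MyModel_" + now + ".meta")
    restorer.restore(sess, "./checkpoint_dir/MyModel_" + now)
    graph = tf.get_default_graph()
    # Look the input placeholders up by the names given when the graph was built.
    query_ph = graph.get_tensor_by_name("query_input/query:0")
    freq_ph = graph.get_tensor_by_name("query_input/candidate_freq:0")
    title_ph = graph.get_tensor_by_name("title_input/title:0")
    tag_ph = graph.get_tensor_by_name("tag_input/tag:0")
    # y_pred was added to the "y_pred" collection before saving.
    y_pred_restored = tf.get_collection("y_pred")[0]
    preds = sess.run(y_pred_restored, feed_dict={query_ph: query_test,
                                                 freq_ph: candidate_freq_test,
                                                 title_ph: title_test,
                                                 tag_ph: tag_test})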
That's the code. The overall idea of the method:
(1) Convert query and title to ids and feed them into an embedding layer.
(2) Concatenate the embedding-layer outputs for query and title with the one-hot encoded tag.
(3) Pass the resulting feature vector through two dense layers, using ReLU and logistic (sigmoid) activations respectively; the sigmoid output is taken as the click probability.
With a positive/negative threshold of 0.35-0.4, the online score reaches about 0.7 (the algorithm has some inherent randomness, so identical results are not guaranteed on every run; the deviation is small).
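For reference, turning the predicted probabilities into 0/1 labels is just a comparison against the chosen cut-off; a minimal sketch (the exact value inside the 0.35-0.4 range is a tuning choice, and ypred is the test-set output from the session above):

import numpy as np

threshold = 0.375  # illustrative value from the 0.35-0.4 range mentioned above
labels = (np.array(ypred).reshape(-1) >= threshold).astype(int)  # 1 = predicted click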
The preprocessing code is not included for now; the overall idea is as follows (a rough sketch of the id-conversion step is given after this list):
query and title are converted to ids,
target is one-hot encoded,
ctr is not used for now either,
candidate_freq is a vector of length 1 (I originally planned to embed all of the candidate queries, but that turned out hard to handle, so I dropped it and this part is essentially unused).
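Since that code is omitted, the following is only my own rough sketch of how the id conversion might look; the vocab_dict construction, tokenization, and padding length here are assumptions and not from the original post:

import numpy as np

# Hypothetical vocabulary build: ids start at 1 so that voc_size = len(vocab_dict) + 1
# leaves row 0 of the embedding table free (e.g. for padding/unknown tokens).
vocab_dict = {}
for tokens in train_title_tokens:          # train_title_tokens: tokenized titles (assumed name)
    for tok in tokens:
        if tok not in vocab_dict:
            vocab_dict[tok] = len(vocab_dict) + 1

def to_ids(tokens, max_len):
    # Map tokens to ids and pad/truncate to a fixed length (assumed helper).
    ids = [vocab_dict.get(t, 0) for t in tokens][:max_len]
    return ids + [0] * (max_len - len(ids))

title_train = np.array([to_ids(t, 10) for t in train_title_tokens])   # max_len=10 is illustrative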