윤정환

코드, 데이터셋 업로드

#!/usr/bin/env python
# coding: utf-8
# In[ ]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from pathlib import Path
import matplotlib.pyplot as plt
# --- Load and preprocess the training data ------------------------------
# Pass the path straight to pandas so it manages the file handle itself;
# the original opened the file manually and never closed it.
df = pd.read_csv("dataset.csv")

# Build dense integer encodings for users and contents: Keras Embedding
# layers require indices in [0, vocab_size). Reverse maps allow decoding.
user_ids = df["userid"].unique().tolist()
user2user_encoded = {x: i for i, x in enumerate(user_ids)}
userencoded2user = {i: x for i, x in enumerate(user_ids)}
contents_ids = df["contentsid"].unique().tolist()
contents2contents_encoded = {x: i for i, x in enumerate(contents_ids)}
contents_encoded2contents = {i: x for i, x in enumerate(contents_ids)}
df["user"] = df["userid"].map(user2user_encoded)
df["contents"] = df["contentsid"].map(contents2contents_encoded)
num_users = len(user2user_encoded)
num_contents = len(contents_encoded2contents)
df["rating"] = df["rating"].values.astype(np.float32)

# min and max ratings will be used to normalize the ratings later.
# NOTE(review): hard-coded rather than derived from the data; matches the
# 0.5/5 values visible in dataset.csv — confirm if the scale ever changes.
min_rating = 0.5
max_rating = 5.0
print(
    "Number of users: {}, Number of Contents: {}, Min rating: {}, Max rating: {}".format(
        num_users, num_contents, min_rating, max_rating
    )
)

# Shuffle with a fixed seed, then split 90/10 into train/validation.
df = df.sample(frac=1, random_state=42)
x = df[["user", "contents"]].values
# Normalize the targets between 0 and 1. Makes it easy to train.
y = df["rating"].apply(lambda r: (r - min_rating) / (max_rating - min_rating)).values
# Assuming training on 90% of the data and validating on 10%.
train_indices = int(0.9 * df.shape[0])
x_train, x_val, y_train, y_val = (
    x[:train_indices],
    x[train_indices:],
    y[:train_indices],
    y[train_indices:],
)
EMBEDDING_SIZE = 50


class RecommenderNet(keras.Model):
    """Matrix-factorization recommender.

    Learns an embedding plus a scalar bias per user and per contents id;
    the predicted (normalized) rating is
    sigmoid(tensordot(user_vec, contents_vec) + user_bias + contents_bias).
    """

    def __init__(self, num_users, num_contents, embedding_size, **kwargs):
        super().__init__(**kwargs)
        self.num_users = num_users
        self.num_contents = num_contents
        self.embedding_size = embedding_size

        def make_table(count):
            # L2-regularized embedding table, one row per encoded id.
            return layers.Embedding(
                count,
                embedding_size,
                embeddings_initializer="he_normal",
                embeddings_regularizer=keras.regularizers.l2(1e-6),
            )

        self.user_embedding = make_table(num_users)
        self.user_bias = layers.Embedding(num_users, 1)
        self.contents_embedding = make_table(num_contents)
        self.contents_bias = layers.Embedding(num_contents, 1)

    def call(self, inputs):
        # inputs is an integer tensor of shape (batch, 2):
        # column 0 = encoded user id, column 1 = encoded contents id.
        users = inputs[:, 0]
        contents = inputs[:, 1]
        affinity = tf.tensordot(
            self.user_embedding(users), self.contents_embedding(contents), 2
        )
        # NOTE(review): axes=2 contracts over BOTH the batch and embedding
        # dimensions (a single scalar), matching the keras.io collaborative
        # filtering example — verify this is intended vs. a per-row dot
        # (reduce_sum over axis=1). Preserved as-is here.
        # Add all the components (including bias).
        logits = affinity + self.user_bias(users) + self.contents_bias(contents)
        # The sigmoid activation forces the rating to between 0 and 1.
        return tf.nn.sigmoid(logits)
# Build and train the model on the 90% training split.
model = RecommenderNet(num_users, num_contents, EMBEDDING_SIZE)
model.compile(
    optimizer='sgd',
    loss='mse',
    metrics=[tf.keras.metrics.MeanSquaredError()])
history = model.fit(
    x=x_train,
    y=y_train,
    batch_size=2,  # NOTE(review): batch size 2 is very small/slow — confirm intentional
    epochs=20,
    verbose=1,
    validation_data=(x_val, y_val),
)
# Plot training vs. validation loss per epoch.
plt.plot(history.history["loss"])
plt.plot(history.history["val_loss"])
plt.title("model loss")
plt.ylabel("loss")
plt.xlabel("epoch")
plt.legend(["train", "test"], loc="upper left")
plt.show()
# --- Evaluate on the held-out test file ---------------------------------
# Bug fixes vs. the original:
#  * it rebound the name `tf` to a DataFrame, shadowing the tensorflow
#    module for the rest of the script;
#  * it rebuilt fresh user/contents encoders from the test file, so the
#    integer indices fed to the model did NOT correspond to the embedding
#    rows learned during training — the training encoders must be reused;
#  * it opened the CSV manually and never closed the handle.
test_df = pd.read_csv("dataset_test.csv")
test_df["user"] = test_df["userid"].map(user2user_encoded)
test_df["contents"] = test_df["contentsid"].map(contents2contents_encoded)
# Ids never seen during training have no embedding row; drop those pairs.
test_df = test_df.dropna(subset=["user", "contents"])
test_df["rating"] = test_df["rating"].values.astype(np.float32)
test_df = test_df.sample(frac=1, random_state=42)
# dropna leaves the index columns as floats; restore integer indices.
x_test = test_df[["user", "contents"]].astype("int64").values
# Normalize targets exactly as done for training.
y_test = test_df["rating"].apply(
    lambda r: (r - min_rating) / (max_rating - min_rating)
).values
result = model.evaluate(x_test, y_test)
print(result)
#!/usr/bin/env python
# coding: utf-8
# In[ ]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from pathlib import Path
import matplotlib.pyplot as plt
# --- Load and preprocess the training data ------------------------------
# Features and targets ship as two CSVs; align them column-wise.
df_x = pd.read_csv("x_train.csv")
df_y = pd.read_csv("y_train.csv")
df = pd.concat([df_x, df_y], axis=1)

# Dense integer encodings for users and tasks (Embedding layers need
# indices in [0, vocab_size)). Reverse maps allow decoding predictions.
user_ids = df["userid"].unique().tolist()
user2user_encoded = {x: i for i, x in enumerate(user_ids)}
userencoded2user = {i: x for i, x in enumerate(user_ids)}
task_ids = df["taskid"].unique().tolist()
task2task_encoded = {x: i for i, x in enumerate(task_ids)}
task_encoded2task = {i: x for i, x in enumerate(task_ids)}
df["user"] = df["userid"].map(user2user_encoded)
df["task"] = df["taskid"].map(task2task_encoded)
num_users = len(user2user_encoded)
num_task = len(task_encoded2task)
df["rating"] = df["rating"].values.astype(np.float32)
# min and max ratings will be used to normalize the ratings later
# NOTE(review): hard-coded; assumes ratings lie in [0.5, 5.0] — confirm.
MIN_RATING = 0.5
MAX_RATING = 5.0
print(
    "Number of users: {}, Number of task: {}, Min rating: {}, Max rating: {}".format(
        num_users, num_task, MIN_RATING, MAX_RATING
    )
)
# Shuffle with a fixed seed, then split 90/10 into train/validation.
df = df.sample(frac=1, random_state=42)
x = df[["user", "task"]].values
# Normalize the targets between 0 and 1. Makes it easy to train.
y = df["rating"].apply(lambda x: (x - MIN_RATING) / (MAX_RATING - MIN_RATING)).values
# Assuming training on 90% of the data and validating on 10%.
train_indices = int(0.9 * df.shape[0])
x_train, x_val, y_train, y_val = (
    x[:train_indices],
    x[train_indices:],
    y[:train_indices],
    y[train_indices:],
)
EMBEDDING_SIZE = 128


class RecommenderNet(keras.Model):
    """Matrix-factorization recommender.

    Learns an embedding plus a scalar bias per user and per task; the
    predicted (normalized) rating is
    sigmoid(tensordot(user_vec, task_vec) + user_bias + task_bias).
    """

    def __init__(self, num_users, num_task, embedding_size, **kwargs):
        super(RecommenderNet, self).__init__(**kwargs)
        self.num_users = num_users
        self.num_task = num_task
        self.embedding_size = embedding_size
        # L2-regularized embedding table: one row per encoded user id.
        self.user_embedding = layers.Embedding(
            num_users,
            embedding_size,
            embeddings_initializer="he_normal",
            embeddings_regularizer=keras.regularizers.l2(1e-6),
        )
        # Scalar bias per user.
        self.user_bias = layers.Embedding(num_users, 1)
        # One row per encoded task id.
        self.task_embedding = layers.Embedding(
            num_task,
            embedding_size,
            embeddings_initializer="he_normal",
            embeddings_regularizer=keras.regularizers.l2(1e-6),
        )
        # Scalar bias per task.
        self.task_bias = layers.Embedding(num_task, 1)

    def call(self, inputs):
        # inputs is an integer tensor of shape (batch, 2):
        # column 0 = encoded user id, column 1 = encoded task id.
        user_vector = self.user_embedding(inputs[:, 0])
        user_bias = self.user_bias(inputs[:, 0])
        task_vector = self.task_embedding(inputs[:, 1])
        task_bias = self.task_bias(inputs[:, 1])
        # NOTE(review): axes=2 contracts over BOTH the batch and embedding
        # dimensions (a single scalar) — verify vs. a per-row dot product.
        dot_user_task = tf.tensordot(user_vector, task_vector, 2)
        # Add all the components (including bias)
        x = dot_user_task + user_bias + task_bias
        # The sigmoid activation forces the rating to between 0 and 1
        return tf.nn.sigmoid(x)
# Build and train the model on the 90% training split.
model = RecommenderNet(num_users, num_task, EMBEDDING_SIZE)
model.compile(
    optimizer='adam',
    loss='mse',
    metrics=[tf.keras.metrics.MeanSquaredError()])
history = model.fit(
    x=x_train,
    y=y_train,
    batch_size=8,
    epochs=300,  # NOTE(review): 300 epochs with no early stopping — overfitting risk; confirm
    verbose=1,
    validation_data=(x_val, y_val),
)
# --- Predict ratings for the test pairs and write a submission file -----
df_x_test = pd.read_csv('x_test.csv')
# Reuse the encoders fitted on the training data so the integer indices
# correspond to the learned embedding rows.
df_x_test["user"] = df_x_test["userid"].map(user2user_encoded)
df_x_test["task"] = df_x_test["taskid"].map(task2task_encoded)
# NOTE(review): ids unseen during training map to NaN here and would crash
# model.predict — assumed not to occur in x_test.csv; confirm.
x_test = df_x_test[["user", "task"]].values
y_pred = model.predict(x_test)
df_y_pred = pd.DataFrame(y_pred, columns=['rating'])
# Denormalize from [0, 1] back to the original rating scale, in place.
# (The original rebound `df_y_pred` to the Series returned by .apply,
# which made the DataFrame-oriented `columns=` argument below misleading.)
df_y_pred["rating"] = df_y_pred["rating"].apply(
    lambda p: p * (MAX_RATING - MIN_RATING) + MIN_RATING
)
df_y_pred.to_csv('y_pred.csv', sep=',', columns=['rating'], index=False)
#evaluate
# Score the written predictions against the ground truth using RMSE
# (mean_squared_error with squared=False).
import os
import sys
import pandas as pd
from sklearn.metrics import mean_squared_error

gt = pd.read_csv('y_test.csv', header=0).to_numpy().astype(float).reshape(-1)
pr = pd.read_csv('y_pred.csv', header=0).to_numpy().astype(float).reshape(-1)
score = mean_squared_error(gt, pr, squared=False)
print("score:", score)
--- dataset.csv (training data referenced by the first script) follows below ---
userid,contentsid,rating
1,T000043,5
1,T000055,0.5
1,T000072,0.5
1,T000064,5
1,T001630,0.5
1,T000308,0.5
1,T000293,0.5
1,T001616,0.5
1,T001613,5
1,T001601,0.5
1,T001919,0.5
1,T001946,5
2,T000046,5
2,T000051,0.5
2,T000074,0.5
2,T000308,0.5
2,T000307,5
2,T000299,0.5
2,T000291,0.5
2,T001613,5
2,T001607,0.5
2,T001920,5
2,T001916,0.5
2,T001943,0.5
3,T000036,5
3,T000049,0.5
3,T000053,0.5
3,T000061,5
3,T000073,0.5
3,T001628,5
3,T000302,5
3,T000212,0.5
3,T001616,0.5
3,T001606,0.5
3,T001920,0.5
3,T001915,0.5
4,T001947,0.5
4,T001921,0.5
4,T001617,0.5
4,T001606,0.5
4,T000040,0.5
4,T000045,0.5
4,T000060,5
4,T000077,0.5
4,T000068,0.5
4,T000302,0.5
4,T000293,0.5
4,T000288,0.5
5,T001956,5
5,T001915,0.5
5,T001611,5
5,T001604,5
5,T000046,0.5
5,T000056,5
5,T000073,0.5
5,T000065,0.5
5,T001630,0.5
5,T000309,0.5
5,T000299,5
5,T000294,0.5
6,T001943,5
6,T001911,0.5
6,T000036,0.5
6,T000050,5
6,T000056,0.5
6,T000059,5
6,T000074,5
6,T000072,0.5
6,T000071,0.5
6,T000293,0.5
6,T000292,5
6,T000212,0.5
7,T000053,5
7,T000054,5
7,T000060,0.5
7,T000078,0.5
7,T000071,0.5
7,T000298,0.5
7,T000288,0.5
7,T001608,0.5
7,T001606,5
7,T001917,0.5
7,T001915,0.5
7,T001914,5
8,T000040,0.5
8,T000044,0.5
8,T000053,0.5
8,T000059,5
8,T000061,5
8,T000072,0.5
8,T001631,0.5
8,T000301,0.5
8,T000295,5
8,T000294,5
8,T001616,5
8,T001944,5
9,T000049,5
9,T000051,0.5
9,T000054,5
9,T000055,5
9,T000056,5
9,T000311,5
9,T000309,5
9,T000297,5
9,T000289,5
9,T001614,0.5
9,T001612,0.5
9,T001956,0.5
10,T000045,0.5
10,T000053,0.5
10,T000061,5
10,T000069,0.5
10,T000068,0.5
10,T000312,0.5
10,T000303,0.5
10,T000297,0.5
10,T000287,5
10,T001615,5
10,T001609,5
10,T001915,5
11,T000036,5
11,T000046,0.5
11,T000054,5
11,T000077,0.5
11,T000069,0.5
11,T001628,5
11,T000308,5
11,T000301,0.5
11,T000288,5
11,T001610,0.5
11,T001600,5
11,T001914,5
12,T000045,0.5
12,T000048,0.5
12,T000056,5
12,T000075,5
12,T000063,0.5
12,T001631,5
12,T000309,0.5
12,T000302,0.5
12,T000291,5
12,T001616,0.5
12,T001918,5
12,T001947,5
13,T000043,5
13,T000052,5
13,T000059,5
13,T000071,5
13,T001628,0.5
13,T000307,5
13,T000299,0.5
13,T000291,5
13,T001614,0.5
13,T001604,5
13,T001921,0.5
13,T001911,0.5