-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmain.py
85 lines (69 loc) · 3.03 KB
/
main.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
# import libraries
import numpy as np
import os
import pandas as pd
from sklearn.metrics.pairwise import pairwise_distances
from dotenv import load_dotenv
load_dotenv()
# constants: dataset locations, each looked up in the process environment
# (os.getenv yields None for a variable that is not set)
CSV_FILEPATH_BASE = os.getenv('CSV_FILEPATH_BASE')
CSV_USERS_FILEPATH = os.getenv('CSV_USERS_FILEPATH')
CSV_RATINGS_FILEPATH = os.getenv('CSV_RATINGS_FILEPATH')
CSV_ITEMS_FILEPATH = os.getenv('CSV_ITEMS_FILEPATH')

# echo the three per-file paths so a misconfigured environment is visible early
_csv_paths = (CSV_USERS_FILEPATH, CSV_RATINGS_FILEPATH, CSV_ITEMS_FILEPATH)
print('\nCSV filepaths are as follows:\n{}\n{}\n{}\n'.format(*_csv_paths))
# user file: pipe-delimited, column names supplied explicitly
u_cols = ['user_id', 'age', 'sex', 'occupation', 'zip_code']
users = pd.read_csv(
    CSV_USERS_FILEPATH,
    sep='|',
    names=u_cols,
    encoding='latin-1',
)

# ratings file: tab-delimited, column names supplied explicitly
r_cols = ['user_id', 'movie_id', 'rating', 'unix_timestamp']
ratings = pd.read_csv(
    CSV_RATINGS_FILEPATH,
    sep='\t',
    names=r_cols,
    encoding='latin-1',
)

# items file: pipe-delimited; five descriptive columns followed by the
# genre-named columns
i_cols = (
    ['movie id', 'movie title', 'release date', 'video release date', 'IMDb URL']
    + [
        'unknown', 'Action', 'Adventure', 'Animation', "Children's", 'Comedy',
        'Crime', 'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror',
        'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western',
    ]
)
items = pd.read_csv(
    CSV_ITEMS_FILEPATH,
    sep='|',
    names=i_cols,
    encoding='latin-1',
)
# look at each loaded table: heading, shape, then its first rows
# (five rows for users and ratings, six for items)
for _heading, _frame, _n_rows in (
    ('\nUser Data :', users, 5),
    ("\nRatings Data :", ratings, 5),
    ("\nItem Data :", items, 6),
):
    print(_heading)
    print('shape : ', _frame.shape)
    print(_frame.head(_n_rows))
# Load the 'ua.base' / 'ua.test' rating files found under CSV_FILEPATH_BASE.
# os.path.join inserts the path separator when the base directory is given
# without a trailing one; plain string concatenation would instead produce
# a path such as 'some/dirua.base'.
ratings_train = pd.read_csv(
    os.path.join(CSV_FILEPATH_BASE, 'ua.base'),
    sep='\t',
    names=r_cols,
    encoding='latin-1',
)
ratings_test = pd.read_csv(
    os.path.join(CSV_FILEPATH_BASE, 'ua.test'),
    sep='\t',
    names=r_cols,
    encoding='latin-1',
)
print('\nratings_train.shape, ratings_test.shape')
print(ratings_train.shape, ratings_test.shape)
# count the distinct users and items that appear in the ratings table
n_users = len(ratings['user_id'].unique())
n_items = len(ratings['movie_id'].unique())
print('\nn_users, n_items')
print(n_users, n_items)

# user-item matrix: one row per user, one column per movie, 0 where no
# rating was read. Ids are used directly as 1-based positions, so the matrix
# size (distinct-id counts) only fits if ids run 1..n without gaps.
# NOTE(review): confirm this holds for the dataset in use.
data_matrix = np.zeros((n_users, n_items))
for row in ratings.itertuples(index=False):
    data_matrix[row.user_id - 1, row.movie_id - 1] = row.rating
print('\ndata_matrix')
print(data_matrix)
# pairwise_distances(metric='cosine') returns cosine *distance*, i.e.
# 1 - cosine similarity. predict() uses these matrices as neighbour weights,
# so convert distance back to similarity; feeding raw distances would give
# the least similar users/items the largest weights.
user_similarity = 1 - pairwise_distances(data_matrix, metric='cosine')
item_similarity = 1 - pairwise_distances(data_matrix.T, metric='cosine')
def predict(ratings, similarity, type='user'):
    """Predict ratings as a similarity-weighted combination of known ratings.

    Args:
        ratings: 2-D NumPy array of ratings; rows are treated as users and
            columns as items.
        similarity: square NumPy array of weights. For ``type='user'`` it
            needs one row/column per row of ``ratings``; for ``type='item'``
            one per column of ``ratings``.
        type: ``'user'`` or ``'item'``. The name shadows the builtin but is
            kept so existing keyword callers continue to work.

    Returns:
        NumPy array with the same shape as ``ratings``. Entries whose
        normalising weight sum is zero follow NumPy division semantics
        (nan/inf).

    Raises:
        ValueError: if ``type`` is neither ``'user'`` nor ``'item'``.
    """
    if type not in ('user', 'item'):
        # Previously an unknown mode fell through to ``return pred`` and
        # failed with an UnboundLocalError; report the real problem instead.
        raise ValueError(f"type must be 'user' or 'item', got {type!r}")

    # Sum of absolute weights per row of the similarity matrix, used to
    # normalise the weighted sums below.
    weight_totals = np.abs(similarity).sum(axis=1)

    if type == 'user':
        # Column vector so it broadcasts against every item column.
        mean_user_rating = ratings.mean(axis=1)[:, np.newaxis]
        # Work on deviations from each user's mean, then add the mean back.
        ratings_diff = ratings - mean_user_rating
        weighted = similarity.dot(ratings_diff) / weight_totals[:, np.newaxis]
        return mean_user_rating + weighted

    # type == 'item': weight each user's ratings by item-item similarity and
    # normalise each item column by that item's total weight.
    return ratings.dot(similarity) / weight_totals[np.newaxis, :]
# run both prediction modes on the full rating matrix, then print each result
user_prediction = predict(data_matrix, user_similarity, type='user')
item_prediction = predict(data_matrix, item_similarity, type='item')
for _label, _prediction in (
    ('\nuser_prediction', user_prediction),
    ('\nitem_prediction', item_prediction),
):
    print(_label)
    print(_prediction)