-
Notifications
You must be signed in to change notification settings - Fork 4
Expand file tree
/
Copy pathBR_user.py
More file actions
111 lines (93 loc) · 3.88 KB
/
BR_user.py
File metadata and controls
111 lines (93 loc) · 3.88 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
# -*- coding: utf-8 -*-
import math
import pandas as pd
class UserCf:
    """Interface to a user-based collaborative filtering recommender.

    The ratings table is loaded from a CSV file into ``self.frame`` and its
    columns are renamed to ``UserID``, ``BookID`` and ``Rating``. Users are
    compared by the overlap of the books they appear with, and unseen books
    are scored by the similarity-weighted ratings of the nearest users.
    """

    def __init__(self, file_path='./datasets/new/users.csv'):
        """Load the ratings table.

        Args:
            file_path: Path of the CSV file to read. Defaults to the path the
                class was originally hard-wired to.
        """
        self.file_path = file_path
        self._init_frame()

    def _init_frame(self):
        """Read ``self.file_path`` and give the columns their fixed names."""
        self.frame = pd.read_csv(self.file_path)
        # The header found in the file is discarded in favour of these names;
        # presumably the file has exactly three columns -- TODO confirm.
        self.frame.columns = ['UserID', 'BookID', 'Rating']

    @staticmethod
    def _cosine_sim(target_books, books):
        """Return the cosine similarity of two collections of book ids.

        Each collection is treated as a set of distinct books, so the result
        is ``|A & B| / sqrt(|A| * |B|)``.

        Args:
            target_books: Iterable of book ids of the target user.
            books: Iterable of book ids of the user being compared.

        Returns:
            A float; ``0.0`` when the two collections share no book.
        """
        target_set = set(target_books)
        other_set = set(books)
        shared = len(target_set & other_set)
        if shared == 0:
            # Also covers empty inputs, so the division below never sees 0.
            return 0.0
        # Sizes are taken from the sets (not the raw inputs) so that repeated
        # rows for the same book cannot deflate the similarity.
        return shared / math.sqrt(len(target_set) * len(other_set))

    def _get_top_n_users(self, target_user_id, top_n):
        """Return the ``top_n`` users most similar to ``target_user_id``.

        Args:
            target_user_id: Id of the user to find neighbours for.
            top_n: Maximum number of neighbours to return.

        Returns:
            A list of ``(user_id, similarity)`` tuples sorted by descending
            similarity. The target user is never included.
        """
        # One pass over the frame instead of one boolean filter per user.
        books_by_user = {
            user_id: set(book_ids)
            for user_id, book_ids in self.frame.groupby('UserID')['BookID']
        }
        # An unknown target simply has no books and matches nobody.
        target_books = books_by_user.pop(target_user_id, set())
        scored = [
            (user_id, self._cosine_sim(target_books, user_books))
            for user_id, user_books in books_by_user.items()
        ]
        scored.sort(key=lambda pair: pair[1], reverse=True)
        return scored[:top_n]

    def _get_candidates_items(self, target_user_id):
        """Return the books other users have that the target user has not.

        Args:
            target_user_id: Id of the user to recommend for.

        Returns:
            A list of book ids (in no particular order).
        """
        is_target = self.frame['UserID'] == target_user_id
        seen_books = set(self.frame.loc[is_target, 'BookID'])
        other_books = set(self.frame.loc[~is_target, 'BookID'])
        # Plain difference: a symmetric difference would also hand back books
        # that only the target user has, i.e. books already seen.
        return list(other_books - seen_books)

    def _get_top_n_items(self, top_n_users, candidates_books, top_n):
        """Score the candidate books and return the ``top_n`` best.

        A book's interest is the sum, over the neighbours, of
        ``similarity * rating``, where ``rating`` is the neighbour's mean
        rating of that book rounded to two decimals, or 0 when the neighbour
        has no row for the book.

        Args:
            top_n_users: List of ``(user_id, similarity)`` tuples.
            candidates_books: Iterable of candidate book ids.
            top_n: Maximum number of books to return.

        Returns:
            A list of ``(book_id, interest)`` tuples sorted by descending
            interest.
        """
        # Pre-compute each neighbour's rating per book so that every candidate
        # lookup is a dict access rather than a scan of that user's rows.
        neighbour_ratings = []
        for user_id, similarity in top_n_users:
            user_rows = self.frame[self.frame['UserID'] == user_id]
            mean_by_book = user_rows.groupby('BookID')['Rating'].mean().round(2)
            neighbour_ratings.append((similarity, mean_by_book.to_dict()))

        interest_list = []
        for book_id in candidates_books:
            interest = sum(
                similarity * ratings.get(book_id, 0)
                for similarity, ratings in neighbour_ratings
            )
            interest_list.append((book_id, interest))
        interest_list.sort(key=lambda pair: pair[1], reverse=True)
        return interest_list[:top_n]

    def calculate(self, target_user_id, top_n):
        """Run user-based CF and return book recommendations for one user.

        Args:
            target_user_id: Id of the user to recommend for.
            top_n: Used both as the number of neighbours considered and as
                the maximum number of books returned.

        Returns:
            A ``pandas.DataFrame`` with the columns ``UserID``, ``BookID``
            and ``score``, one row per recommended book.
        """
        # Most similar users.
        top_n_users = self._get_top_n_users(target_user_id, top_n)
        # Books the target user could still be recommended.
        candidates_books = self._get_candidates_items(target_user_id)
        # Highest scoring candidates.
        top_n_books = self._get_top_n_items(top_n_users, candidates_books, top_n)
        # Echo the ranking to stdout, as this method has always done.
        print(top_n_books)
        book_ids = [book_id for book_id, _ in top_n_books]
        scores = [score for _, score in top_n_books]
        return pd.DataFrame(
            {'UserID': target_user_id, 'BookID': book_ids, 'score': scores}
        )
def run(i):
    """Compute recommendations for the ``i``-th sampled user and store them.

    Reads the module-level ``users``, ``usercf`` and ``top_n`` and rebinds the
    module-level ``res`` to the accumulated result.

    Args:
        i: Index into the module-level ``users`` list.
    """
    global res
    target_user_id = users[i]
    recommendations = usercf.calculate(target_user_id, top_n)
    # DataFrame.append was removed in pandas 2.0; concat is its replacement
    # and, like append, keeps each frame's own index labels.
    res = pd.concat([res, recommendations])
import random

# Ratings table; it is only used below to draw the sample of user ids.
# NOTE(review): UserCf() loads './datasets/new/users.csv' by default while the
# ids are sampled from this file -- confirm both files cover the same users.
path = './datasets/new/bookrating.csv'
Data = pd.read_csv(path)
Data.columns = ['UserID', 'BookID', 'Rating']

# Accumulator that run() extends through its `global res`.
res = pd.DataFrame(columns=['UserID', 'BookID', 'score'])
usercf = UserCf()

# Build the pool of distinct user ids once instead of on every draw.
# random.choice samples with replacement, so an id may be drawn repeatedly.
user_pool = list(set(Data['UserID']))
users = [random.choice(user_pool) for _ in range(20)]
top_n = 10

for x in range(len(users)):
    print(x)  # progress indicator
    run(x)

print(res)
res.to_csv('./datasets/new/booktuijian.csv')