# ExploringGraphRecommendationSystems/data.py at main · gfkaceli/ExploringGraphRecommendationSystems · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
import pandas as pd
from convert import convert_categorical
import networkx as nx
import itertools
import collections

# Raw MovieLens-100k input files (paths are relative to the working directory).
ratings_path = 'datasets/ml-100k/ua.base'
users_path = 'datasets/ml-100k/u.user'
items_path = 'datasets/ml-100k/u.item'

# Column names are passed explicitly to read_csv below, so pandas treats the
# first line of each file as data rather than as a header row.
_rating_fields = ['UID', 'MID', 'rating', 'timestamp']
_user_fields = ['UID', 'age', 'gender', 'job', 'zip']
_movie_info_fields = ['MID', 'title', 'release_date', 'video_release_date', 'IMDb_URL']
_genre_fields = [
    'unknown', 'Action', 'Adventure', 'Animation', "Children's", 'Comedy',
    'Crime', 'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror',
    'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western',
]

# Ratings: tab-separated rows.
ratings = pd.read_csv(
    ratings_path,
    sep='\t',
    names=_rating_fields,
    encoding='latin-1',
)

# Users: pipe-separated rows.
users = pd.read_csv(
    users_path,
    sep='|',
    names=_user_fields,
    encoding='latin-1',
)

# Movies (items): pipe-separated rows; the descriptive columns come first,
# followed by one column per genre name.
movies = pd.read_csv(
    items_path,
    sep='|',
    names=_movie_info_fields + _genre_fields,
    encoding='latin-1',
)

# Encode the categorical user attributes with the project helper
# convert_categorical (its exact encoding is defined in convert.py).
users = convert_categorical(users, 'job')
users = convert_categorical(users, 'gender')

# Replace the raw age by an age-group label before encoding it.
# pd.cut uses right-closed intervals by default, so the groups are:
#   (0, 10]   -> '1'
#   (10, 20]  -> '2'
#   (20, 30]  -> '3'
#   (30, 40]  -> '4'
#   (40, 50]  -> '5'
#   (50, 100] -> '6'
# Ages outside (0, 100] fall in no bin and become NaN.
users['age'] = pd.cut(
    users['age'],
    [0, 10, 20, 30, 40, 50, 100],
    labels=['1', '2', '3', '4', '5', '6'],
)
users = convert_categorical(users, 'age')
users = users.drop('zip', axis=1)

# Keep only the movie id and the genre columns.
movies = movies.drop(['video_release_date', 'IMDb_URL', 'title', 'release_date'], axis=1)

# The rating timestamp is not used downstream, so it is dropped directly
# (previously it was converted to datetime first and then discarded anyway).
ratings = ratings.drop('timestamp', axis=1)

print(movies.columns)
print(ratings.head())
print(users.columns)

# Persist the cleaned ratings and movies tables.
ratings.to_csv("datasets/encoded/ratings.csv", index=False)
movies.to_csv("datasets/encoded/movies.csv", index=False)
# Each coefficient is multiplied by the number of items to obtain the minimum
# number of (movie, rating) agreements two users need to be connected.
alpha_coefs = [0.01]

# Number of distinct items in the catalogue (1682 for MovieLens-100k);
# derived from the data instead of being hard-coded.
item_count = movies['MID'].nunique()

# Output column -> function computing a {node: value} dict on the user graph.
# Insertion order is the order in which the columns are added to `users`.
graph_features = {
    'PR': lambda graph: nx.pagerank(graph.to_directed()),
    'CD': nx.degree_centrality,
    'CC': nx.closeness_centrality,
    'CB': nx.betweenness_centrality,
    'LC': nx.load_centrality,
    'AND': lambda graph: nx.average_neighbor_degree(graph, weight='weight'),
}

G = nx.Graph()
for alpha_coef in alpha_coefs:
    # Start from an empty graph for every coefficient; otherwise edges created
    # for a previous alpha would leak into the features of the next one.
    G.clear()

    # Count, for every pair of users, how many times they gave the same
    # rating to the same movie.
    counter = collections.Counter()
    for _, group in ratings.groupby(['MID', 'rating']):
        # Sorting the ids gives every pair a canonical (low, high) orientation,
        # so the count does not depend on the row order of the ratings file.
        counter.update(itertools.combinations(sorted(group['UID']), 2))

    alpha = alpha_coef * item_count  # param * number of items

    # Keep the pairs that agree at least `alpha` times (single pass, in
    # first-seen order).
    edge_list = [pair for pair, shared in counter.items() if shared >= alpha]

    for first, second in edge_list:
        G.add_edge(first, second, weight=1)
        # Each endpoint also receives a self-loop.
        # NOTE(review): self-loops influence the degree-based measures below;
        # presumably intentional - confirm.
        G.add_edge(first, first, weight=1)
        G.add_edge(second, second, weight=1)

    # Attach every graph feature to the users table, scaled by its maximum.
    # Users that are not nodes of the graph get NaN here (filled with 0 below).
    for column, compute in graph_features.items():
        values = users['UID'].map(compute(G))
        users[column] = values / float(values.max())

    # fillna returns a new frame, so `users` itself keeps its NaNs and no
    # in-place write on a derived frame is needed.
    X_train = users.fillna(0)
    X_train.to_csv(f"datasets/encoded/{alpha_coef}users.csv", index=False)