# @Author: Tung Nguyen
# @Version: 1.2
#@title
import os
import shutil
import urllib
import urllib.request
import zipfile

import numpy as np
import pandas as pd
from sklearn.datasets import dump_svmlight_file
from sklearn.model_selection import train_test_split
#@title
import urllib.request  # needed for urlopen; plain `import urllib` does not expose it

# MovieLens dataset download URLs, keyed by short dataset name.
datasets = {'ml100k': 'http://files.grouplens.org/datasets/movielens/ml-100k.zip',
            'ml20m': 'http://files.grouplens.org/datasets/movielens/ml-20m.zip',
            'mllatestsmall': 'http://files.grouplens.org/datasets/movielens/ml-latest-small.zip',
            'ml10m': 'http://files.grouplens.org/datasets/movielens/ml-10m.zip',
            'ml1m': 'http://files.grouplens.org/datasets/movielens/ml-1m.zip'
            }
print('Available datasets: ', [key for key in datasets])
dt = input('Dataset name = ')
print('You selected {}'.format(dt))
# Download the file from `url` and save it locally under `file_name`:
dt_name = os.path.basename(datasets[dt])
print('Downloading {}'.format(dt_name))
# Colab provides ./sample_data; create it when running elsewhere.
os.makedirs('./sample_data', exist_ok=True)
with urllib.request.urlopen(datasets[dt]) as response, open('./sample_data/' + dt_name, 'wb') as out_file:
    shutil.copyfileobj(response, out_file)
print('Download completed')
with zipfile.ZipFile('./sample_data/' + dt_name, 'r') as zip_ref:
    zip_ref.extractall('./sample_data/')
# First guess at the extracted directory name (refined by the scan below,
# since some archives extract to a differently-cased folder).
dt_dir_name = os.path.splitext(dt_name)[0]
#Check unzipped structure
def list_files(startpath):
    """Print an indented tree view of the directory rooted at *startpath*."""
    print(startpath)
    for root, _, files in os.walk(startpath):
        # Depth below startpath decides the indent width (4 spaces per level).
        depth = root.replace(startpath, '').count(os.sep)
        print('{}{}/'.format(' ' * 4 * depth, os.path.basename(root)))
        for entry in files:
            print('{}{}'.format(' ' * 4 * (depth + 1), entry))
# Locate the extracted MovieLens directory under ./sample_data.
dirs = [x[0] for x in os.walk("./sample_data")]
#print(list(dirs[0]))
# The original filter(lambda ...) was equivalent to this comprehension.
ml_dirs = [d for d in dirs if 'ml' in d]
if not ml_dirs:
    # Fail with a clear message instead of a bare IndexError on list(ml)[0].
    raise FileNotFoundError('no extracted MovieLens directory found under ./sample_data')
dt_dir_name = ml_dirs[0]
print(dt_dir_name)
#list_files('./sample_data/'+dt_dir_name +'/')
# NOTE: you may need to inspect the extracted data files to confirm their format.
# Load the ratings (and auxiliary) tables for the chosen dataset.
# Each branch matches the on-disk layout of that MovieLens release.
if dt == 'ml100k':
    rdata = pd.read_csv(dt_dir_name + '/' + 'u.data', delimiter='\t',
                        names=['userId', 'movieId', 'rating', 'timestamp'])
    usrdata = pd.read_csv(dt_dir_name + '/' + 'u.user', delimiter='|',
                          names=['user id', 'age', 'gender', 'occupation', 'zip code'])
    rdata.shape
elif dt == 'mllatestsmall':
    rdata = pd.read_csv(dt_dir_name + '/' + 'ratings.csv')
    tagdata = pd.read_csv(dt_dir_name + '/' + 'tags.csv')
    rdata.shape
elif dt == 'ml1m':
    # '::' is a multi-char separator: use sep='::' with the python engine
    # (the old delimiter='\:\:' was an invalid escape sequence and relied on
    # pandas silently falling back to the python engine with a warning).
    rdata = pd.read_csv(dt_dir_name + '/' + 'ratings.dat', sep='::', engine='python',
                        names=['userId', 'movieId', 'rating', 'timestamp'])
elif dt == 'ml20m':
    rdata = pd.read_csv(dt_dir_name + '/' + 'ratings.csv')
    tagdata = pd.read_csv(dt_dir_name + '/' + 'tags.csv')
elif dt == 'ml10m':
    rdata = pd.read_csv(dt_dir_name + '/' + 'ratings.dat', sep='::', engine='python',
                        names=['userId', 'movieId', 'rating', 'timestamp'])
rdata.head()
rdata.info()
#is any row null
# Quick sanity check: flag columns containing missing values.
rdata.isnull().any()
# NOTE(review): usrdata is only defined in the ml100k branch above — these two
# lines raise NameError for other datasets; run them for ml100k only.
usrdata.head()
usrdata.isnull().any()
# If available (ml-latest-small / ml-20m only), inspect the tag data:
# NOTE(review): tagdata is only defined for the mllatestsmall / ml20m branches;
# this cell raises NameError for the other datasets.
tagdata.head()
tagdata.isnull().any()
#Version 1.1
import random
import time
def neg_sampling(ratings, num_neg=2):
    """Build an implicit-feedback set by adding random negative samples.

    Every observed (userId, movieId) pair becomes a positive row
    (interact == 1); for each rating, `num_neg` movies the user has not
    interacted with are drawn at random and appended as negative rows
    (interact == -1), so the result is (1 + num_neg) times the input size.

    Parameters:
        ratings: pandas DataFrame with at least userId|movieId|timestamp columns.
        num_neg: negative samples generated per positive row
                 (default 2 — matches the version 1.1 behaviour).

    Returns:
        pandas DataFrame with columns userId|movieId|timestamp|interact.
    """
    movies = ratings['movieId'].drop_duplicates()  # candidate pool for negatives
    # .copy() so adding a column does not trigger SettingWithCopyWarning
    # (the original assigned into a slice of `ratings`).
    nsamples = ratings[['userId', 'movieId', 'timestamp']].copy()
    nsamples['interact'] = 1  # vectorized; the per-row apply was needless work
    # O(1) membership test instead of re-scanning the whole frame for every
    # candidate; also tracks freshly generated negatives so none duplicate.
    seen = set(zip(nsamples['userId'], nsamples['movieId']))
    neg_rows = []
    start_time = time.time()
    for i, row in ratings.iterrows():
        if i % 5000 == 0:  # progress report every 5000 rows
            stop_time = time.time()
            print("processed ... {0:0.2f}% ...{1:0.2f}secs".format(float(i)*100 / len(ratings), stop_time - start_time))
            start_time = stop_time
        u = row['userId']
        remaining = num_neg
        while remaining > 0:
            m = movies.sample(n=1).values[0]  # random candidate movie id
            if (u, m) not in seen:
                seen.add((u, m))
                remaining -= 1
                neg_rows.append([u, m, int(time.time()), -1])
    ntempdf = pd.DataFrame(data=neg_rows, columns=['userId', 'movieId', 'timestamp', 'interact'])
    return pd.concat([nsamples, ntempdf], ignore_index=True)
#get negative samples to ns var. Later can save it to train/test/. The generation data is quite slow at version 1.
# `ns` is consumed by the train/test split cells below.
ns = neg_sampling(rdata)
print("done: ", ns.shape)
ns.tail(5)
# NOTE: ml100k already ships with pre-made train/test splits in its archive.
#From rating, drop timestamp
# 80/20 split into train+test, then 80/20 of train into train+val;
# random_state is fixed for reproducibility.
rX_train, rX_test = train_test_split(rdata[['userId', 'movieId', 'rating']], test_size=0.2, random_state = 101)
rX_train, rX_val = train_test_split(rX_train, test_size=0.2, random_state= 101)
print(rX_train.shape, rX_test.shape, rX_val.shape)
# Check the train set
rX_train.head()
# Same split scheme for the negative-sampled interaction data.
rX_train_ns, rX_test_ns = train_test_split(ns, test_size=0.2, random_state = 101)
rX_train_ns, rX_val_ns = train_test_split(rX_train_ns, test_size=0.2, random_state= 101)
print(rX_train_ns.shape, rX_test_ns.shape, rX_val_ns.shape)
# Check the train set ns
rX_train_ns.head()
def save_to_csv(data_frames, out_file_names, outdir='./sample_data/processed/'):
    """Write each dataframe to `outdir` as CSV without header or index.

    Parameters:
        data_frames: list of pandas DataFrames.
        out_file_names: list of file names, one per dataframe.
        outdir: target directory; created (with parents) when missing.
    """
    assert len(data_frames)==len(out_file_names), "number of dataframes must equal number of file names"
    # os.mkdir raised FileNotFoundError when the parent directory was missing;
    # makedirs(..., exist_ok=True) also removes the exists-check race.
    os.makedirs(outdir, exist_ok=True)
    for frame, fname in zip(data_frames, out_file_names):
        frame.to_csv(outdir + fname, header=None, index=False)
#Train, test, validation of original data as CSV
save_to_csv([rX_train, rX_test, rX_val], ['rX_train.csv', 'rX_test.csv', 'rx_val.csv'] )
# Train, test, validation of the negative-sampled data as CSV.
save_to_csv([rX_train_ns, rX_test_ns, rX_val_ns], ['rX_train_ns.csv', 'rX_test_ns.csv', 'rX_val_ns.csv'] )
def save_to_libfm(data_frames, out_file_names, target='rating',
                  outdir='./sample_data/processed/',
                  feature_cols=('userId', 'movieId', 'timestamp')):
    """Dump each dataframe to libFM/svmlight sparse format.

    Parameters:
        data_frames: list of pandas DataFrames containing feature_cols and target.
        out_file_names: list of file names, one per dataframe.
        target: name of the label column ('rating' or 'interact').
        outdir: target directory; created (with parents) when missing.
        feature_cols: columns used as features (tuple default avoids the
                      mutable-default pitfall; default matches the original
                      hard-coded layout).
    """
    assert len(data_frames)==len(out_file_names), "number of dataframes must equal number of file names"
    # os.mkdir raised FileNotFoundError when the parent directory was missing.
    os.makedirs(outdir, exist_ok=True)
    for frame, fname in zip(data_frames, out_file_names):
        dump_svmlight_file(frame[list(feature_cols)],
                           frame[[target]].values.ravel(),
                           outdir + fname,
                           zero_based=True,
                           multilabel=False)
# original data
save_to_libfm([rX_train, rX_test, rX_val], ['rX_train.libfm', 'rX_test.libfm', 'rx_val.libfm'])
# negative sampled data: label column is 'interact' instead of 'rating'
save_to_libfm([rX_train_ns, rX_test_ns, rX_val_ns], ['rX_train_ns.libfm', 'rX_test_ns.libfm', 'rx_val_ns.libfm'], target='interact')
import shutil  # already imported at the top; kept from the original notebook cell
# Zip the processed output directory and offer it for local download.
zipFileName = 'ml_processed'
# BUG FIX: `outdir` was previously undefined at module level (it only existed
# as a parameter inside the save functions), so make_archive raised NameError.
outdir = './sample_data/processed/'
shutil.make_archive(zipFileName, 'zip', outdir)
#download file to local system
from google.colab import files
files.download(zipFileName + '.zip')
# Alternatively, upload the archive to Google Drive (it lands in the Drive root, path ./):
# Import PyDrive and associated libraries.
# This only needs to be done once in a notebook.
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials
# Authenticate and create the PyDrive client.
# This only needs to be done once in a notebook.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)
# Create & upload the processed-data zip archive to Google Drive.
uploaded = drive.CreateFile({'title': zipFileName+'.zip'})
uploaded.SetContentFile(zipFileName+'.zip')
uploaded.Upload()
print('Uploaded file with ID {}'.format(uploaded.get('id')))