@Author: Tung Nguyen

@Version: 1.2

Import section

In [0]:
#@title
from sklearn.datasets import dump_svmlight_file
import numpy as np
import pandas as pd
import os
import urllib.request
import zipfile
from sklearn.model_selection import train_test_split
import shutil
In [0]:
#@title
datasets = {'ml100k':'http://files.grouplens.org/datasets/movielens/ml-100k.zip',
            'ml20m':'http://files.grouplens.org/datasets/movielens/ml-20m.zip',
            'mllatestsmall':'http://files.grouplens.org/datasets/movielens/ml-latest-small.zip',
            'ml10m':'http://files.grouplens.org/datasets/movielens/ml-10m.zip',
            'ml1m':'http://files.grouplens.org/datasets/movielens/ml-1m.zip'
            }

Select the dataset to process

Note: type the dataset name into the input box below

In [0]:
print('Available datasets: ', list(datasets))
dt = input('Dataset name = ')
print('You selected {}'.format(dt))
Available datasets:  ['ml100k', 'ml20m', 'mllatestsmall', 'ml10m', 'ml1m']
Dataset name = ml100k
You selected ml100k

Download and extract
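
Note: the ./sample_data directory exists by default on Colab. If you run this elsewhere, create it first; a minimal guard (an addition, not part of the original flow):

In [0]:
# Assumption: running outside Colab, where ./sample_data may not exist yet
os.makedirs('./sample_data', exist_ok=True)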

In [0]:
# Download the selected dataset archive and save it locally under ./sample_data/
dt_name = os.path.basename(datasets[dt])

print('Downloading {}'.format(dt_name))
with urllib.request.urlopen(datasets[dt]) as response, open('./sample_data/'+dt_name, 'wb') as out_file:
    shutil.copyfileobj(response, out_file)
print('Download completed')
Downloading ml-100k.zip
Download completed
In [0]:
with zipfile.ZipFile('./sample_data/'+dt_name, 'r') as zip_ref:
    zip_ref.extractall('./sample_data/')
dt_dir_name = os.path.splitext(dt_name)[0]
In [0]:
#Check unzipped structure
def list_files(startpath):
    print(startpath)
    for root, dirs, files in os.walk(startpath):
        level = root.replace(startpath, '').count(os.sep)
        indent = ' ' * 4 * (level)
        print('{}{}/'.format(indent, os.path.basename(root)))
        subindent = ' ' * 4 * (level + 1)
        for f in files:
            print('{}{}'.format(subindent, f))
dirs = [x[0] for x in os.walk("./sample_data")]
#print(list(dirs[0]))
# Find the extracted dataset directory (its name contains 'ml')
ml = [dirName for dirName in dirs if 'ml' in dirName]
dt_dir_name = ml[0]
print(dt_dir_name)
#list_files('./sample_data/'+dt_dir_name +'/')
./sample_data/ml-100k

Manual section

You may need to open the raw data file to check its format before loading it
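
For example, a quick peek at the raw ml-100k ratings file (a small sketch assuming dt == 'ml100k'; the other datasets use different file names) confirms the tab delimiter before choosing read_csv options:

In [0]:
# Print the first raw lines of u.data to inspect the column layout and delimiter
with open(os.path.join(dt_dir_name, 'u.data')) as f:
  for _ in range(3):
    print(f.readline().rstrip())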

In [0]:
# Explore the ratings (and auxiliary) data; the column layout differs per dataset
if dt=='ml100k':
  rdata = pd.read_csv(dt_dir_name +'/'+ 'u.data', delimiter='\t', names=['userId', 'movieId', 'rating', 'timestamp'])
  usrdata = pd.read_csv(dt_dir_name +'/'+ 'u.user', delimiter='|', names=['user id', 'age', 'gender', 'occupation', 'zip code'])
  print(rdata.shape)
elif dt=='mllatestsmall':
  rdata = pd.read_csv(dt_dir_name +'/'+ 'ratings.csv')
  tagdata = pd.read_csv(dt_dir_name +'/'+ 'tags.csv')
  print(rdata.shape)
elif dt=='ml1m':
  rdata = pd.read_csv(dt_dir_name +'/'+ 'ratings.dat', sep='::', engine='python', names=['userId', 'movieId', 'rating', 'timestamp'])
elif dt=='ml20m':
  rdata = pd.read_csv(dt_dir_name +'/'+ 'ratings.csv')
  tagdata = pd.read_csv(dt_dir_name +'/'+ 'tags.csv')
elif dt=='ml10m':
  rdata = pd.read_csv(dt_dir_name +'/'+ 'ratings.dat', sep='::', engine='python', names=['userId', 'movieId', 'rating', 'timestamp'])

Optional: Check rating data

In [0]:
rdata.head()
Out[0]:
   userId  movieId  rating  timestamp
0     196      242       3  881250949
1     186      302       3  891717742
2      22      377       1  878887116
3     244       51       2  880606923
4     166      346       1  886397596
In [0]:
rdata.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100836 entries, 0 to 100835
Data columns (total 4 columns):
userId       100836 non-null int64
movieId      100836 non-null int64
rating       100836 non-null float64
timestamp    100836 non-null int64
dtypes: float64(1), int64(3)
memory usage: 3.1 MB
In [0]:
# Check whether any column contains null values
rdata.isnull().any()
Out[0]:
userId       False
movieId      False
rating       False
timestamp    False
dtype: bool

Check user data (ml100k only)

In [0]:
usrdata.head()
Out[0]:
   user id  age gender  occupation zip code
0        1   24      M  technician    85711
1        2   53      F       other    94043
2        3   23      M      writer    32067
3        4   24      M  technician    43537
4        5   33      F       other    15213
In [0]:
usrdata.isnull().any()
Out[0]:
user id       False
age           False
gender        False
occupation    False
zip code      False
dtype: bool

Check tag data

Only available for the latest-small and 20M datasets

In [0]:
tagdata.head()
Out[0]:
   userId  movieId              tag   timestamp
0       2    60756            funny  1445714994
1       2    60756  Highly quotable  1445714996
2       2    60756     will ferrell  1445714992
3       2    89774     Boxing story  1445715207
4       2    89774              MMA  1445715200
In [0]:
tagdata.isnull().any()
Out[0]:
userId       False
movieId      False
tag          False
timestamp    False
dtype: bool

Optional: Negative sampling set

Generation can be optimized to run faster

(Check out V1.1 below)

Negative Sampling V1.1 (faster)

In [0]:
#Version 1.1
import random
import time

def neg_sampling(ratings):
  """Version 1.1: for each positive interaction, add 2 negative samples
     (the result is 3 times larger than the original dataset).

    Parameters:
    ratings: pandas dataframe with columns userId|movieId|rating|timestamp

    Returns:
    negative-sampled set as a pandas dataframe
            userId|movieId|timestamp|interact (implicit feedback: 1 = observed, -1 = sampled negative)
  """
  users = ratings['userId'].drop_duplicates()   # list of user ids
  movies = ratings['movieId'].drop_duplicates() # list of movie ids
  nsamples = ratings[['userId', 'movieId', 'timestamp']].copy()
  nsamples['interact'] = 1  # every observed rating counts as a positive interaction
  nTempData = []
  start_time = time.time()
  stop_time = time.time()
  for i, row in ratings.iterrows():
    if(i%5000==0):
      stop_time = time.time()
      print("processed ... {0:0.2f}% ...{1:0.2f}secs".format(float(i)*100 / len(ratings), stop_time - start_time))
      start_time = stop_time
    u = row['userId']
    j = 2
    while j > 0:
      m = movies.sample(n=1).values[0] # pick a random movie id
      # keep the candidate as a negative sample only if the user-movie pair is not observed
      if(not ((nsamples['userId']==u) & (nsamples['movieId']==m)).any()):
          j -= 1
          nTempData.append([u, m, int(time.time()), -1])

  ntempdf = pd.DataFrame(data=nTempData, columns=['userId', 'movieId', 'timestamp', 'interact'])
  nsamples = pd.concat([nsamples, ntempdf], ignore_index=True)
  return nsamples
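
The membership test above scans the whole positive dataframe for every candidate pair, which dominates the runtime. Below is a minimal sketch of a faster variant (neg_sampling_fast is a hypothetical helper, not part of the original notebook) that keeps the observed (userId, movieId) pairs in a Python set for O(1) lookups while following the same 2-negatives-per-positive policy:

In [0]:
# Sketch only: same sampling policy as neg_sampling above, but with a set-based membership test
def neg_sampling_fast(ratings, n_neg=2):
  movies = ratings['movieId'].drop_duplicates().values
  observed = set(zip(ratings['userId'], ratings['movieId']))  # O(1) membership checks
  neg_rows = []
  for u in ratings['userId']:
    added = 0
    while added < n_neg:
      m = movies[np.random.randint(len(movies))]  # random candidate movie id
      if (u, m) not in observed:
        neg_rows.append([u, m, int(time.time()), -1])
        added += 1
  positives = ratings[['userId', 'movieId', 'timestamp']].copy()
  positives['interact'] = 1
  negatives = pd.DataFrame(neg_rows, columns=['userId', 'movieId', 'timestamp', 'interact'])
  return pd.concat([positives, negatives], ignore_index=True)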
In [0]:
# Get the negative samples into the ns variable; they can be saved to train/test files later. Generation was quite slow in version 1.0.
ns = neg_sampling(rdata)
print("done: ", ns.shape)
processed ... 0.00% ...0.00secs
processed ... 5.00% ...28.44secs
processed ... 10.00% ...28.18secs
processed ... 15.00% ...28.13secs
processed ... 20.00% ...28.22secs
processed ... 25.00% ...27.91secs
processed ... 30.00% ...27.92secs
processed ... 35.00% ...28.18secs
processed ... 40.00% ...28.21secs
processed ... 45.00% ...27.66secs
processed ... 50.00% ...27.70secs
processed ... 55.00% ...27.47secs
processed ... 60.00% ...27.53secs
processed ... 65.00% ...27.46secs
processed ... 70.00% ...27.21secs
processed ... 75.00% ...27.36secs
processed ... 80.00% ...27.68secs
processed ... 85.00% ...27.20secs
processed ... 90.00% ...26.99secs
processed ... 95.00% ...27.02secs
done:  (300000, 4)
In [0]:
ns.tail(5)
Out[0]:
        userId  movieId  interact
299995     276      660        -1
299996      13     1318        -1
299997      13     1014        -1
299998      12      810        -1
299999      12     1337        -1

Create train/test/eval sets for the ORIGINAL RATINGS only

Note: ml100k already ships with some pre-made train/test splits
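
If you prefer the bundled splits, a minimal sketch of loading the first pre-made fold instead of re-splitting (ml100k only; fold1_train/fold1_test are illustrative names):

In [0]:
# ml-100k ships with 5 pre-made folds (u1.base/u1.test ... u5.base/u5.test)
fold1_train = pd.read_csv(os.path.join(dt_dir_name, 'u1.base'), delimiter='\t',
                          names=['userId', 'movieId', 'rating', 'timestamp'])
fold1_test = pd.read_csv(os.path.join(dt_dir_name, 'u1.test'), delimiter='\t',
                         names=['userId', 'movieId', 'rating', 'timestamp'])
print(fold1_train.shape, fold1_test.shape)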

In [0]:
#From rating, drop timestamp

rX_train, rX_test = train_test_split(rdata[['userId', 'movieId', 'rating']], test_size=0.2, random_state = 101)
rX_train, rX_val = train_test_split(rX_train, test_size=0.2, random_state= 101)
print(rX_train.shape, rX_test.shape, rX_val.shape)
(51200, 3) (16000, 3) (12800, 3)
In [0]:
# Check the train set
rX_train.head()
Out[0]:
        userId  movieId  rating
73060      473     1895     2.0
63194      414     2671     4.0
67862      438     3968     2.5
25263      177     3624     3.0
100192     610    45210     4.0

Create train/test/eval sets for the negative-sampled data (from RATINGS only)

In [0]:
rX_train_ns, rX_test_ns = train_test_split(ns, test_size=0.2, random_state = 101)
rX_train_ns, rX_val_ns = train_test_split(rX_train_ns, test_size=0.2, random_state= 101)
print(rX_train_ns.shape, rX_test_ns.shape, rX_val_ns.shape)
(192000, 4) (60000, 4) (48000, 4)
In [0]:
# Check the train set ns
rX_train_ns.head()
Out[0]:
        userId  movieId  interact
89651      315      202         1
51626      716      238         1
267896     863      142        -1
16601      418      301         1
183748     203     1053        -1

Save the processed train, test, eval data frames to files

In [0]:
def save_to_csv(data_frames, out_file_names, outdir='./sample_data/processed/'):
  if not os.path.exists(outdir):
    os.makedirs(outdir)
  assert len(data_frames)==len(out_file_names), "number of dataframes must equal number of file names"
  for i in range(len(out_file_names)):
    data_frames[i].to_csv(os.path.join(outdir, out_file_names[i]), header=False, index=False)
In [0]:
# Train, test, validation of the original data as CSV
save_to_csv([rX_train, rX_test, rX_val], ['rX_train.csv', 'rX_test.csv', 'rX_val.csv'])

Train, test, validation of neg_sampling data as CSV

In [0]:
save_to_csv([rX_train_ns, rX_test_ns, rX_val_ns], ['rX_train_ns.csv', 'rX_test_ns.csv', 'rX_val_ns.csv'] )

Transform the train, test, eval sets to libfm/svmlight/libsvm format (*.libfm)

In [0]:
def save_to_libfm(data_frames, out_file_names, target='rating', outdir='./sample_data/processed/'):
  if not os.path.exists(outdir):
    os.makedirs(outdir)
  assert len(data_frames)==len(out_file_names), "number of dataframes must equal number of file names"
  for i in range(len(out_file_names)):
    # use every column except the target as features
    # (the original rating frames have no timestamp column, so the columns cannot be hard-coded)
    feature_cols = [c for c in data_frames[i].columns if c != target]
    dump_svmlight_file(data_frames[i][feature_cols],
                       data_frames[i][target].values.ravel(),
                       os.path.join(outdir, out_file_names[i]),
                       zero_based=True, multilabel=False)
In [0]:
# original data
save_to_libfm([rX_train, rX_test, rX_val], ['rX_train.libfm', 'rX_test.libfm', 'rX_val.libfm'])
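
Each line of the generated file follows the svmlight/libfm convention `target featureIndex:value ...`. An optional quick peek at the file just written (the path assumes the default outdir):

In [0]:
# Print the first lines of the generated libfm/svmlight file to verify the layout
with open('./sample_data/processed/rX_train.libfm') as f:
  for _ in range(3):
    print(f.readline().rstrip())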
In [0]:
# negative-sampled data
save_to_libfm([rX_train_ns, rX_test_ns, rX_val_ns], ['rX_train_ns.libfm', 'rX_test_ns.libfm', 'rX_val_ns.libfm'], target='interact')

Zip processed folder and download to local drive

In [0]:
import shutil
zipFileName = 'ml_processed'
outdir = './sample_data/processed/'  # the folder written by save_to_csv / save_to_libfm above
shutil.make_archive(zipFileName, 'zip', outdir)
Out[0]:
'/content/ml_processed.zip'
In [0]:
#download file to local system
from google.colab import files
files.download(zipFileName+'.zip')

Or save to Google Drive

The file is uploaded to the root path (./) of your Drive

In [0]:
# Import PyDrive and associated libraries.
# This only needs to be done once in a notebook.
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials

# Authenticate and create the PyDrive client.
# This only needs to be done once in a notebook.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)

# Create & upload the zip file.
uploaded = drive.CreateFile({'title': zipFileName+'.zip'})
uploaded.SetContentFile(zipFileName+'.zip')
uploaded.Upload()
print('Uploaded file with ID {}'.format(uploaded.get('id')))