@Author: Tung Nguyen

@Version: 1.2

Import section

In [0]:
#@title
from sklearn.datasets import dump_svmlight_file
import numpy as np
import pandas as pd
import os
import urllib.request
import zipfile
from sklearn.model_selection import train_test_split
import shutil
In [0]:
#@title
datasets = {'ml100k':'http://files.grouplens.org/datasets/movielens/ml-100k.zip',
            'ml20m':'http://files.grouplens.org/datasets/movielens/ml-20m.zip',
            'mllatestsmall':'http://files.grouplens.org/datasets/movielens/ml-latest-small.zip',
            'ml10m':'http://files.grouplens.org/datasets/movielens/ml-10m.zip',
            'ml1m':'http://files.grouplens.org/datasets/movielens/ml-1m.zip'
            }

Select the dataset to process

Note: type the dataset name into the input box below

In [0]:
print('Available datasets: ', list(datasets))
dt = input('Dataset name = ')
print('You selected {}'.format(dt))
Available datasets:  ['ml100k', 'ml20m', 'mllatestsmall', 'ml10m', 'ml1m']
Dataset name = ml100k
You selected ml100k

Download and extract
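
Note: the ./sample_data directory exists by default on Colab. If you run this elsewhere, create it first; a minimal guard (an addition, not part of the original flow):

In [0]:
# Assumption: running outside Colab, where ./sample_data may not exist yet
os.makedirs('./sample_data', exist_ok=True)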

In [0]:
# Download the selected dataset archive and save it locally under ./sample_data/
dt_name = os.path.basename(datasets[dt])

print('Downloading {}'.format(dt_name))
with urllib.request.urlopen(datasets[dt]) as response, open('./sample_data/'+dt_name, 'wb') as out_file:
    shutil.copyfileobj(response, out_file)
print('Download completed')
Downloading ml-100k.zip
Download completed
In [0]:
with zipfile.ZipFile('./sample_data/'+dt_name, 'r') as zip_ref:
    zip_ref.extractall('./sample_data/')
dt_dir_name = os.path.splitext(dt_name)[0]
In [0]:
#Check unzipped structure
def list_files(startpath):
    print(startpath)
    for root, dirs, files in os.walk(startpath):
        level = root.replace(startpath, '').count(os.sep)
        indent = ' ' * 4 * (level)
        print('{}{}/'.format(indent, os.path.basename(root)))
        subindent = ' ' * 4 * (level + 1)
        for f in files:
            print('{}{}'.format(subindent, f))
dirs = [x[0] for x in os.walk("./sample_data")]
#print(list(dirs[0]))
# Find the extracted dataset directory (its name contains 'ml')
ml = [dirName for dirName in dirs if 'ml' in dirName]
dt_dir_name = ml[0]
print(dt_dir_name)
#list_files('./sample_data/'+dt_dir_name +'/')
./sample_data/ml-100k

Manual section

You may need to open the raw data file to check its format before loading it
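
For example, a quick peek at the raw ml-100k ratings file (a small sketch assuming dt == 'ml100k'; the other datasets use different file names) confirms the tab delimiter before choosing read_csv options:

In [0]:
# Print the first raw lines of u.data to inspect the column layout and delimiter
with open(os.path.join(dt_dir_name, 'u.data')) as f:
  for _ in range(3):
    print(f.readline().rstrip())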

In [0]:
# Explore the ratings (and auxiliary) data; the column layout differs per dataset
if dt=='ml100k':
  rdata = pd.read_csv(dt_dir_name +'/'+ 'u.data', delimiter='\t', names=['userId', 'movieId', 'rating', 'timestamp'])
  usrdata = pd.read_csv(dt_dir_name +'/'+ 'u.user', delimiter='|', names=['user id', 'age', 'gender', 'occupation', 'zip code'])
  print(rdata.shape)
elif dt=='mllatestsmall':
  rdata = pd.read_csv(dt_dir_name +'/'+ 'ratings.csv')
  tagdata = pd.read_csv(dt_dir_name +'/'+ 'tags.csv')
  print(rdata.shape)
elif dt=='ml1m':
  rdata = pd.read_csv(dt_dir_name +'/'+ 'ratings.dat', sep='::', engine='python', names=['userId', 'movieId', 'rating', 'timestamp'])
elif dt=='ml20m':
  rdata = pd.read_csv(dt_dir_name +'/'+ 'ratings.csv')
  tagdata = pd.read_csv(dt_dir_name +'/'+ 'tags.csv')
elif dt=='ml10m':
  rdata = pd.read_csv(dt_dir_name +'/'+ 'ratings.dat', sep='::', engine='python', names=['userId', 'movieId', 'rating', 'timestamp'])

Optional: Check rating data

In [0]:
rdata.head()
Out[0]:
   userId  movieId  rating  timestamp
0     196      242       3  881250949
1     186      302       3  891717742
2      22      377       1  878887116
3     244       51       2  880606923
4     166      346       1  886397596
In [0]:
rdata.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100836 entries, 0 to 100835
Data columns (total 4 columns):
userId       100836 non-null int64
movieId      100836 non-null int64
rating       100836 non-null float64
timestamp    100836 non-null int64
dtypes: float64(1), int64(3)
memory usage: 3.1 MB
In [0]:
# Check whether any column contains null values
rdata.isnull().any()
Out[0]:
userId       False
movieId      False
rating       False
timestamp    False
dtype: bool

Check user data (ml100k only)

In [0]:
usrdata.head()
Out[0]:
   user id  age gender  occupation zip code
0        1   24      M  technician    85711
1        2   53      F       other    94043
2        3   23      M      writer    32067
3        4   24      M  technician    43537
4        5   33      F       other    15213
In [0]:
usrdata.isnull().any()
Out[0]:
user id       False
age           False
gender        False
occupation    False
zip code      False
dtype: bool

Check tag data

Only available for the latest-small and 20M datasets

In [0]:
tagdata.head()
Out[0]:
   userId  movieId              tag   timestamp
0       2    60756            funny  1445714994
1       2    60756  Highly quotable  1445714996
2       2    60756     will ferrell  1445714992
3       2    89774     Boxing story  1445715207
4       2    89774              MMA  1445715200
In [0]:
tagdata.isnull().any()
Out[0]:
userId       False
movieId      False
tag          False
timestamp    False
dtype: bool

Optional: Negative sampling set

Generation can be optimized to run faster

(Check out V1.1 below)

Negative Sampling V1.1 (faster)

In [0]:
#Version 1.1
import random
import time

def neg_sampling(ratings):
  """Version 1.1: for each positive interaction, add 2 negative samples
     (the result is 3 times larger than the original dataset).

    Parameters:
    ratings: pandas dataframe with columns userId|movieId|rating|timestamp

    Returns:
    negative-sampled set as a pandas dataframe
            userId|movieId|timestamp|interact (implicit feedback: 1 = observed, -1 = sampled negative)
  """
  users = ratings['userId'].drop_duplicates()   # list of user ids
  movies = ratings['movieId'].drop_duplicates() # list of movie ids
  nsamples = ratings[['userId', 'movieId', 'timestamp']].copy()
  nsamples['interact'] = 1  # every observed rating counts as a positive interaction
  nTempData = []
  start_time = time.time()
  stop_time = time.time()
  for i, row in ratings.iterrows():
    if(i%5000==0):
      stop_time = time.time()
      print("processed ... {0:0.2f}% ...{1:0.2f}secs".format(float(i)*100 / len(ratings), stop_time - start_time))
      start_time = stop_time
    u = row['userId']
    j = 2
    while j > 0:
      m = movies.sample(n=1).values[0] # pick a random movie id
      # keep the candidate as a negative sample only if the user-movie pair is not observed
      if(not ((nsamples['userId']==u) & (nsamples['movieId']==m)).any()):
          j -= 1
          nTempData.append([u, m, int(time.time()), -1])

  ntempdf = pd.DataFrame(data=nTempData, columns=['userId', 'movieId', 'timestamp', 'interact'])
  nsamples = pd.concat([nsamples, ntempdf], ignore_index=True)
  return nsamples
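
The membership test above scans the whole positive dataframe for every candidate pair, which dominates the runtime. Below is a minimal sketch of a faster variant (neg_sampling_fast is a hypothetical helper, not part of the original notebook) that keeps the observed (userId, movieId) pairs in a Python set for O(1) lookups while following the same 2-negatives-per-positive policy:

In [0]:
# Sketch only: same sampling policy as neg_sampling above, but with a set-based membership test
def neg_sampling_fast(ratings, n_neg=2):
  movies = ratings['movieId'].drop_duplicates().values
  observed = set(zip(ratings['userId'], ratings['movieId']))  # O(1) membership checks
  neg_rows = []
  for u in ratings['userId']:
    added = 0
    while added < n_neg:
      m = movies[np.random.randint(len(movies))]  # random candidate movie id
      if (u, m) not in observed:
        neg_rows.append([u, m, int(time.time()), -1])
        added += 1
  positives = ratings[['userId', 'movieId', 'timestamp']].copy()
  positives['interact'] = 1
  negatives = pd.DataFrame(neg_rows, columns=['userId', 'movieId', 'timestamp', 'interact'])
  return pd.concat([positives, negatives], ignore_index=True)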
In [0]:
# Get the negative samples into the ns variable; they can be saved to train/test files later. Generation was quite slow in version 1.0.
ns = neg_sampling(rdata)
print("done: ", ns.shape)
processed ... 0.00% ...0.00secs
processed ... 5.00% ...28.44secs
processed ... 10.00% ...28.18secs
processed ... 15.00% ...28.13secs
processed ... 20.00% ...28.22secs
processed ... 25.00% ...27.91secs
processed ... 30.00% ...27.92secs
processed ... 35.00% ...28.18secs
processed ... 40.00% ...28.21secs
processed ... 45.00% ...27.66secs
processed ... 50.00% ...27.70secs
processed ... 55.00% ...27.47secs
processed ... 60.00% ...27.53secs
processed ... 65.00% ...27.46secs
processed ... 70.00% ...27.21secs
processed ... 75.00% ...27.36secs
processed ... 80.00% ...27.68secs
processed ... 85.00% ...27.20secs
processed ... 90.00% ...26.99secs
processed ... 95.00% ...27.02secs
done:  (300000, 4)
In [0]:
ns.tail(5)
Out[0]:
        userId  movieId  interact
299995     276      660        -1
299996      13     1318        -1
299997      13     1014        -1
299998      12      810        -1
299999      12     1337        -1

Create train/test/eval sets for the ORIGINAL RATINGS only

Note: ml100k already ships with some pre-made train/test splits
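
If you prefer the bundled splits, a minimal sketch of loading the first pre-made fold instead of re-splitting (ml100k only; fold1_train/fold1_test are illustrative names):

In [0]:
# ml-100k ships with 5 pre-made folds (u1.base/u1.test ... u5.base/u5.test)
fold1_train = pd.read_csv(os.path.join(dt_dir_name, 'u1.base'), delimiter='\t',
                          names=['userId', 'movieId', 'rating', 'timestamp'])
fold1_test = pd.read_csv(os.path.join(dt_dir_name, 'u1.test'), delimiter='\t',
                         names=['userId', 'movieId', 'rating', 'timestamp'])
print(fold1_train.shape, fold1_test.shape)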

In [0]:
#From rating, drop timestamp

rX_train, rX_test = train_test_split(rdata[['userId', 'movieId', 'rating']], test_size=0.2, random_state = 101)
rX_train, rX_val = train_test_split(rX_train, test_size=0.2, random_state= 101)
print(rX_train.shape, rX_test.shape, rX_val.shape)
(51200, 3) (16000, 3) (12800, 3)
In [0]:
# Check the train set
rX_train.head()
Out[0]:
        userId  movieId  rating
73060      473     1895     2.0
63194      414     2671     4.0
67862      438     3968     2.5
25263      177     3624     3.0
100192     610    45210     4.0

Create train/test/eval sets for the negative-sampled data (from RATINGS only)

In [0]:
rX_train_ns, rX_test_ns = train_test_split(ns, test_size=0.2, random_state = 101)
rX_train_ns, rX_val_ns = train_test_split(rX_train_ns, test_size=0.2, random_state= 101)
print(rX_train_ns.shape, rX_test_ns.shape, rX_val_ns.shape)
(192000, 4) (60000, 4) (48000, 4)
In [0]:
# Check the train set ns
rX_train_ns.head()
Out[0]:
        userId  movieId  interact
89651      315      202         1
51626      716      238         1
267896     863      142        -1
16601      418      301         1
183748     203     1053        -1

Save the processed train, test, eval data frames to files

In [0]:
def save_to_csv(data_frames, out_file_names, outdir='./sample_data/processed/'):
  if not os.path.exists(outdir):
    os.makedirs(outdir)
  assert len(data_frames)==len(out_file_names), "number of dataframes must equal number of file names"
  for i in range(len(out_file_names)):
    data_frames[i].to_csv(os.path.join(outdir, out_file_names[i]), header=False, index=False)
In [0]:
# Train, test, validation of the original data as CSV
save_to_csv([rX_train, rX_test, rX_val], ['rX_train.csv', 'rX_test.csv', 'rX_val.csv'])

Train, test, validation of neg_sampling data as CSV

In [0]:
save_to_csv([rX_train_ns, rX_test_ns, rX_val_ns], ['rX_train_ns.csv', 'rX_test_ns.csv', 'rX_val_ns.csv'] )

Transform the train, test, eval sets to libfm/svmlight/libsvm format (*.libfm)

In [0]:
def save_to_libfm(data_frames, out_file_names, target='rating', outdir='./sample_data/processed/'):
  if not os.path.exists(outdir):
    os.makedirs(outdir)
  assert len(data_frames)==len(out_file_names), "number of dataframes must equal number of file names"
  for i in range(len(out_file_names)):
    # use every column except the target as features
    # (the original rating frames have no timestamp column, so the columns cannot be hard-coded)
    feature_cols = [c for c in data_frames[i].columns if c != target]
    dump_svmlight_file(data_frames[i][feature_cols],
                       data_frames[i][target].values.ravel(),
                       os.path.join(outdir, out_file_names[i]),
                       zero_based=True, multilabel=False)
In [0]:
# original data
save_to_libfm([rX_train, rX_test, rX_val], ['rX_train.libfm', 'rX_test.libfm', 'rX_val.libfm'])
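
Each line of the generated file follows the svmlight/libfm convention `target featureIndex:value ...`. An optional quick peek at the file just written (the path assumes the default outdir):

In [0]:
# Print the first lines of the generated libfm/svmlight file to verify the layout
with open('./sample_data/processed/rX_train.libfm') as f:
  for _ in range(3):
    print(f.readline().rstrip())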
In [0]:
# negative-sampled data
save_to_libfm([rX_train_ns, rX_test_ns, rX_val_ns], ['rX_train_ns.libfm', 'rX_test_ns.libfm', 'rX_val_ns.libfm'], target='interact')

Zip processed folder and download to local drive

In [0]:
import shutil
zipFileName = 'ml_processed'
outdir = './sample_data/processed/'  # the folder written by save_to_csv / save_to_libfm above
shutil.make_archive(zipFileName, 'zip', outdir)
Out[0]:
'/content/ml_processed.zip'
In [0]:
#download file to local system
from google.colab import files
files.download(zipFileName+'.zip')

Or save to Google Drive

The file is uploaded to the root path (./) of your Drive

In [0]:
# Import PyDrive and associated libraries.
# This only needs to be done once in a notebook.
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials

# Authenticate and create the PyDrive client.
# This only needs to be done once in a notebook.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)

# Create & upload the zip file.
uploaded = drive.CreateFile({'title': zipFileName+'.zip'})
uploaded.SetContentFile(zipFileName+'.zip')
uploaded.Upload()
print('Uploaded file with ID {}'.format(uploaded.get('id')))