import sys
import pandas as pd
import numpy as np
import scipy.sparse as sparse
from scipy.sparse.linalg import spsolve
import random

from sklearn.preprocessing import MinMaxScaler

import implicit # The Cython library

Load dataset

!ls ../jobs/data/*.tsv
../jobs/data/apps.tsv	     ../jobs/data/user_history.tsv
../jobs/data/jobs.tsv	     ../jobs/data/users.tsv
../jobs/data/test_users.tsv  ../jobs/data/window_dates.tsv

path = '../jobs/data/'
users = pd.read_csv(path+'users.tsv', sep='\t', encoding='utf-8')
jobs = pd.read_csv(path+'jobs.tsv', sep='\t', encoding='utf-8', error_bad_lines=False)
apps = pd.read_csv(path+'apps.tsv', sep='\t', encoding='utf-8')
user_history = pd.read_csv(path+'user_history.tsv', sep='\t', encoding='utf-8')
test_users = pd.read_csv(path+'test_users.tsv', sep='\t', encoding='utf-8')
b'Skipping line 122433: expected 11 fields, saw 12\n'
b'Skipping line 602576: expected 11 fields, saw 12\n'
b'Skipping line 990950: expected 11 fields, saw 12\n'
/home/hectoryee/anaconda3/lib/python3.6/site-packages/IPython/core/ DtypeWarning: Columns (8) have mixed types. Specify dtype option on import or set low_memory=False.
  interactivity=interactivity, compiler=compiler, result=result)

JobID WindowID Title Description Requirements City State Country Zip5 StartDate EndDate
0 1 1 Security Engineer/Technical Lead <p>Security Clearance Required:&nbsp; Top Secr... <p>SKILL SET</p>\r<p>&nbsp;</p>\r<p>Network Se... Washington DC US 20531 2012-03-07 13:17:01.643 2012-04-06 23:59:59
1 4 1 SAP Business Analyst / WM <strong>NO Corp. to Corp resumes&nbsp;are bein... <p><b>WHAT YOU NEED: </b></p>\r<p>Four year co... Charlotte NC US 28217 2012-03-21 02:03:44.137 2012-04-20 23:59:59
2 7 1 P/T HUMAN RESOURCES ASSISTANT <b> <b> P/T HUMAN RESOURCES ASSISTANT</b> <... Please refer to the Job Description to view th... Winter Park FL US 32792 2012-03-02 16:36:55.447 2012-04-01 23:59:59
3 8 1 Route Delivery Drivers CITY BEVERAGES Come to work for the best in th... Please refer to the Job Description to view th... Orlando FL US NaN 2012-03-03 09:01:10.077 2012-04-02 23:59:59
4 9 1 Housekeeping I make sure every part of their day is magica... Please refer to the Job Description to view th... Orlando FL US NaN 2012-03-03 09:01:11.88 2012-04-02 23:59:59


Splitting into Training and Testing dataset

currently not using this

users_training = users.loc[users['Split'] == 'Train']
users_testing = users.loc[users['Split'] == 'Test']

apps_training = apps.loc[apps['Split'] == 'Train']
apps_testing = apps.loc[apps['Split'] == 'Test']

user_history_training = user_history.loc[user_history['Split'] == 'Train']
user_history_testing = user_history.loc[user_history['Split'] == 'Test']


  • users_training
  • users_testing
  • apps_training
  • apps_testing
  • user_history_training
  • user_history_testing

jobs_US = jobs.loc[jobs['Country'] == 'US']
jobs_US_base_line = jobs_US.iloc[0:10000,0:8]

jobs_US_base_line['Title'] = jobs_US_base_line['Title'].fillna('')
jobs_US_base_line['Description'] = jobs_US_base_line['Description'].fillna('')
#jobs_US_base_line['Requirements'] = jobs_US_base_line['Requirements'].fillna('')

jobs_US_base_line['Description'] = jobs_US_base_line['Title'] + jobs_US_base_line['Description']

From here onwards use jobs_US_base_line data frame to work on, which is selected by jobs_US.iloc[0:10000,0:8].

Clean html

import re

def preprocessor(text):
    text = text.replace('\\r', '').replace('&nbsp', '').replace('\n', '')
    text = re.sub('<[^>]*>', '', text)
    emoticons = re.findall('(?::|;|=)(?:-)?(?:\)|\(|D|P)', text)
    text = re.sub('[\W]+', ' ', text.lower()) +\
        ' '.join(emoticons).replace('-', '')
    return text

jobs_US_base_line['Description'] = jobs_US_base_line['Description'].astype(dtype='str').apply(preprocessor)

UserID WindowID Split ApplicationDate JobID
0 47 1 Train 2012-04-04 15:56:23.537 169528
1 47 1 Train 2012-04-06 01:03:00.003 284009
2 47 1 Train 2012-04-05 02:40:27.753 2121
3 47 1 Train 2012-04-05 02:37:02.673 848187
4 47 1 Train 2012-04-05 22:44:06.653 733748

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1603111 entries, 0 to 1603110
Data columns (total 5 columns):
UserID             1603111 non-null int64
WindowID           1603111 non-null int64
Split              1603111 non-null object
ApplicationDate    1603111 non-null object
JobID              1603111 non-null int64
dtypes: int64(3), object(2)
memory usage: 61.2+ MB

application = apps_training[['UserID', 'JobID']]

grouped_apps = application.groupby(['UserID', 'JobID']).sum().reset_index()

grouped_apps['Quantity'] = 1

UserID JobID Quantity
0 7 309823 1
1 7 703889 1
2 9 136489 1
3 9 617374 1
4 9 809208 1


JobID WindowID Title Description Requirements City State Country
0 1 1 Security Engineer/Technical Lead security engineer technical leadsecurity clear... <p>SKILL SET</p>\r<p>&nbsp;</p>\r<p>Network Se... Washington DC US
1 4 1 SAP Business Analyst / WM sap business analyst wmno corp to corp resumes... <p><b>WHAT YOU NEED: </b></p>\r<p>Four year co... Charlotte NC US
2 7 1 P/T HUMAN RESOURCES ASSISTANT p t human resources assistant p t human resour... Please refer to the Job Description to view th... Winter Park FL US
3 8 1 Route Delivery Drivers route delivery driverscity beverages come to w... Please refer to the Job Description to view th... Orlando FL US
4 9 1 Housekeeping housekeepingi make sure every part of their da... Please refer to the Job Description to view th... Orlando FL US

job_lookup = jobs[['JobID','Title']].drop_duplicates()
job_lookup['Title'] = job_lookup.Title.astype(str)

JobID Title
0 1 Security Engineer/Technical Lead
1 4 SAP Business Analyst / WM
3 8 Route Delivery Drivers
4 9 Housekeeping

Create the sparse ratings matrix of users and items utilizing the code

users = list(np.sort(grouped_apps.UserID.unique()))
jobs = list(grouped_apps.JobID.unique())
application = list(grouped_apps.Quantity)

rows = grouped_apps.UserID.astype('category', categories = users)
cols = grouped_apps.JobID.astype('category', categories = jobs)

apps_sparse = sparse.csr_matrix((application, (rows, cols)), shape=(len(users), len(jobs)))
/home/hectoryee/anaconda3/lib/python3.6/site-packages/ FutureWarning: specifying 'categories' or 'ordered' in .astype() is deprecated; pass a CategoricalDtype instead
/home/hectoryee/anaconda3/lib/python3.6/site-packages/ FutureWarning: specifying 'categories' or 'ordered' in .astype() is deprecated; pass a CategoricalDtype instead

<308033x349742 sparse matrix of type '<class 'numpy.int64'>'
	with 1417514 stored elements in Compressed Sparse Row format>

matrix_size = apps_sparse.shape[0] * apps_sparse.shape[1]
num_apps = len(apps_sparse.nonzero()[0])
sparsity = 100 * (1 - (num_apps/matrix_size))

98.3% of the interaction matrix is sparse. For collaborative filtering to work, the maximum sparsity you could get away with would probably be about 99.5% or so. We are above this, might not get decent results.


def make_train(ratings, pct_test = 0.2):
    This function will take in the original user-item matrix and "mask" a percentage of the original ratings where a
    user-item interaction has taken place for use as a test set. The test set will contain all of the original ratings, 
    while the training set replaces the specified percentage of them with a zero in the original ratings matrix. 
    ratings - the original ratings matrix from which you want to generate a train/test set. Test is just a complete
    copy of the original set. This is in the form of a sparse csr_matrix. 
    pct_test - The percentage of user-item interactions where an interaction took place that you want to mask in the 
    training set for later comparison to the test set, which contains all of the original ratings. 
    training_set - The altered version of the original data with a certain percentage of the user-item pairs 
    that originally had interaction set back to zero.
    test_set - A copy of the original ratings matrix, unaltered, so it can be used to see how the rank order 
    compares with the actual interactions.
    user_inds - From the randomly selected user-item indices, which user rows were altered in the training data.
    This will be necessary later when evaluating the performance via AUC.
    test_set = ratings.copy() # Make a copy of the original set to be the test set. 
    test_set[test_set != 0] = 1 # Store the test set as a binary preference matrix
    training_set = ratings.copy() # Make a copy of the original data we can alter as our training set. 
    nonzero_inds = training_set.nonzero() # Find the indices in the ratings data where an interaction exists
    nonzero_pairs = list(zip(nonzero_inds[0], nonzero_inds[1])) # Zip these pairs together of user,item index into list
    random.seed(0) # Set the random seed to zero for reproducibility
    num_samples = int(np.ceil(pct_test*len(nonzero_pairs))) # Round the number of samples needed to the nearest integer
    samples = random.sample(nonzero_pairs, num_samples) # Sample a random number of user-item pairs without replacement
    user_inds = [index[0] for index in samples] # Get the user row indices
    item_inds = [index[1] for index in samples] # Get the item column indices
    training_set[user_inds, item_inds] = 0 # Assign all of the randomly chosen user-item pairs to zero
    training_set.eliminate_zeros() # Get rid of zeros in sparse array storage after update to save space
    return training_set, test_set, list(set(user_inds)) # Output the unique list of user rows that were altered  

product_train, product_test, product_users_altered = make_train(apps_sparse, pct_test = 0.2)

Implement implicit

import implicit

alpha = 15
user_vecs, item_vecs = implicit.alternating_least_squares((product_train*alpha).astype('double'), 
                                                          regularization = 0.1, 
                                                         iterations = 50)
This method is deprecated. Please use the AlternatingLeastSquares class instead
WARNING:root:Intel MKL BLAS detected. Its highly recommend to set the environment variable 'export MKL_NUM_THREADS=1' to disable its internal multithreading
100%|██████████| 50.0/50 [00:30<00:00,  1.44it/s]

A Recommendation Example

users_arr = np.array(users) # Array of customer IDs from the ratings matrix
jobs_arr = np.array(jobs ) # Array of product IDs from the ratings matrix

def get_items_purchased(customer_id, mf_train, customers_list, products_list, item_lookup):
    This just tells me which items have been already purchased by a specific user in the training set. 
    customer_id - Input the customer's id number that you want to see prior purchases of at least once
    mf_train - The initial ratings training set used (without weights applied)
    customers_list - The array of customers used in the ratings matrix
    products_list - The array of products used in the ratings matrix
    item_lookup - A simple pandas dataframe of the unique product ID/product descriptions available
    A list of item IDs and item descriptions for a particular customer that were already purchased in the training set
    cust_ind = np.where(customers_list == customer_id)[0][0] # Returns the index row of our customer id
    purchased_ind = mf_train[cust_ind,:].nonzero()[1] # Get column indices of purchased items
    prod_codes = products_list[purchased_ind] # Get the stock codes for our purchased items
#     return prod_codes
    return item_lookup.loc[item_lookup.JobID.isin(prod_codes)]

In [32]:

array([ 7,  9, 14, 16, 18])

In [33]:
<class 'pandas.core.frame.DataFrame'>
Int64Index: 1091923 entries, 0 to 1091922
Data columns (total 2 columns):
JobID    1091923 non-null int64
Title    1091923 non-null object
dtypes: int64(1), object(1)
memory usage: 25.0+ MB

get_items_purchased(14, product_train, users_arr, jobs_arr, job_lookup)
JobID Title
605671 206046 Materials Manager / Director
626723 372423 Sales Manager
652060 574999 Mortgage Loan Specialist
663486 663552 Mortgage Professionals - All Levels
678829 787741 Planner Specialist
702882 978868 Branch Coordinator

from sklearn.preprocessing import MinMaxScaler

def rec_items(customer_id, mf_train, user_vecs, item_vecs, customer_list, item_list, item_lookup, num_items = 10):
    This function will return the top recommended items to our users 
    customer_id - Input the customer's id number that you want to get recommendations for
    mf_train - The training matrix you used for matrix factorization fitting
    user_vecs - the user vectors from your fitted matrix factorization
    item_vecs - the item vectors from your fitted matrix factorization
    customer_list - an array of the customer's ID numbers that make up the rows of your ratings matrix 
                    (in order of matrix)
    item_list - an array of the products that make up the columns of your ratings matrix
                    (in order of matrix)
    item_lookup - A simple pandas dataframe of the unique product ID/product descriptions available
    num_items - The number of items you want to recommend in order of best recommendations. Default is 10. 
    - The top n recommendations chosen based on the user/item vectors for items never interacted with/purchased
    cust_ind = np.where(customer_list == customer_id)[0][0] # Returns the index row of our customer id
    pref_vec = mf_train[cust_ind,:].toarray() # Get the ratings from the training set ratings matrix
    pref_vec = pref_vec.reshape(-1) + 1 # Add 1 to everything, so that items not purchased yet become equal to 1
    pref_vec[pref_vec > 1] = 0 # Make everything already purchased zero
    rec_vector = user_vecs[cust_ind,:].dot(item_vecs.T) # Get dot product of user vector and all item vectors
    # Scale this recommendation vector between 0 and 1
    min_max = MinMaxScaler()
    rec_vector_scaled = min_max.fit_transform(rec_vector.reshape(-1,1))[:,0] 
    recommend_vector = pref_vec*rec_vector_scaled 
    # Items already purchased have their recommendation multiplied by zero
    product_idx = np.argsort(recommend_vector)[::-1][:num_items] # Sort the indices of the items into order 
    # of best recommendations
    rec_list = [] # start empty list to store items
    for index in product_idx:
        code = item_list[index]
        rec_list.append([code, item_lookup.Title.loc[item_lookup.JobID == code].iloc[0]]) 
        # Append our descriptions to the list
    codes = [item[0] for item in rec_list]
    descriptions = [item[1] for item in rec_list]
    final_frame = pd.DataFrame({'JobId': codes, 'Title': descriptions}) # Create a dataframe 
    return final_frame[['JobId', 'Title']] # Switch order of columns around

get_items_purchased(7, product_train, users_arr, jobs_arr, job_lookup)
JobID Title
479106 309823 Teller
527966 703889 Customer Service Representative, Now Accepting...

rec_items(7, product_train, user_vecs, item_vecs, users_arr, jobs_arr, job_lookup,
                       num_items = 10)
JobId Title
0 1101648 Customer Service/Receptionist/Front Desk
1 12561 Dental Hygienist - up to $40/ hour!
2 802205 Customer Service Representative
3 344145 Customer Service Representative
4 1050711 Customer Service Representative
5 601021 Customer Service/Receptionist/Call Center Rep
6 601126 Customer Service/Front Desk Help/Receptionist
7 394969 Call Center Customer Service Collections / Credit
8 287182 Administrative Assistant
9 1046722 Data Entry Associate

Second Implementation using implicit library.

from implicit.als import AlternatingLeastSquares

# train model
model = AlternatingLeastSquares(factors=50,

confidence = 40 * apps_sparse)
100%|██████████| 50.0/50 [00:51<00:00,  1.07it/s]

# recommend items for a user
user_items = apps_sparse.T.tocsr()
recommendations = model.recommend(7, user_items)

# find related items
# related = model.similar_items(itemid)

[(57371, 0.029422828555080816),
 (95894, 0.028675761246652852),
 (168777, 0.026280032617673803),
 (186945, 0.025853081049481193),
 (6434, 0.022265122614712515),
 (215515, 0.02175052321349273),
 (286526, 0.02006039079638639),
 (188385, 0.019824223520708895),
 (306468, 0.019819373496054445),
 (122115, 0.019597172209618055)]

for i in range(len(recommendations)):

rec_items(7, product_train, user_vecs, item_vecs, users_arr, jobs_arr, job_lookup,
                       num_items = 10)
JobId Title
0 1101648 Customer Service/Receptionist/Front Desk
1 12561 Dental Hygienist - up to $40/ hour!
2 802205 Customer Service Representative
3 344145 Customer Service Representative
4 1050711 Customer Service Representative
5 601021 Customer Service/Receptionist/Call Center Rep
6 601126 Customer Service/Front Desk Help/Receptionist
7 394969 Call Center Customer Service Collections / Credit
8 287182 Administrative Assistant
9 1046722 Data Entry Associate

JobID Title
14973 57371 Termite Inspector (147-480)
72738 286526 Customer Service Representative/E-Marketing Co...
308148 168777 A/R Medical Biller
310916 186945 Wireless Project Manager
441140 6434 CNC Machinist
451716 95894 Part-Time Customer Service Representative
466763 215515 Product Specialist!!! - Excellent Opportunity!
594338 122115 Maintenance Technician
893325 306468 Data / Business Analyst – Deliver Relevance th...
996719 188385 Assistant Store Manager - Casual Male XL

get_items_purchased(7, product_train, users_arr, jobs_arr, job_lookup)
JobID Title
479106 309823 Teller
527966 703889 Customer Service Representative, Now Accepting...


