Job Recommendation Engine with Implicit Feedback

In [1]:

import sys
import pandas as pd
import numpy as np
import scipy.sparse as sparse
from scipy.sparse.linalg import spsolve
import random

from sklearn.preprocessing import MinMaxScaler

import implicit # The Cython library

Load dataset

In [2]:

!ls ../jobs/data/*.tsv

../jobs/data/apps.tsv	     ../jobs/data/user_history.tsv
../jobs/data/jobs.tsv	     ../jobs/data/users.tsv
../jobs/data/test_users.tsv  ../jobs/data/window_dates.tsv

In [3]:

path = '../jobs/data/'
users = pd.read_csv(path+'users.tsv', sep='\t', encoding='utf-8')
jobs = pd.read_csv(path+'jobs.tsv', sep='\t', encoding='utf-8', error_bad_lines=False)
apps = pd.read_csv(path+'apps.tsv', sep='\t', encoding='utf-8')
user_history = pd.read_csv(path+'user_history.tsv', sep='\t', encoding='utf-8')
test_users = pd.read_csv(path+'test_users.tsv', sep='\t', encoding='utf-8')

b'Skipping line 122433: expected 11 fields, saw 12\n'
b'Skipping line 602576: expected 11 fields, saw 12\n'
b'Skipping line 990950: expected 11 fields, saw 12\n'
/home/hectoryee/anaconda3/lib/python3.6/site-packages/IPython/core/interactiveshell.py:2785: DtypeWarning: Columns (8) have mixed types. Specify dtype option on import or set low_memory=False.
  interactivity=interactivity, compiler=compiler, result=result)

In [4]:

jobs.replace('NaN',np.NaN)
jobs.head()

	JobID	WindowID	Title	Description	Requirements	City	State	Country	Zip5	StartDate	EndDate
0	1	1	Security Engineer/Technical Lead	<p>Security Clearance Required:  Top Secr...	<p>SKILL SET</p>\r<p> </p>\r<p>Network Se...	Washington	DC	US	20531	2012-03-07 13:17:01.643	2012-04-06 23:59:59
1	4	1	SAP Business Analyst / WM	<strong>NO Corp. to Corp resumes are bein...	<p><b>WHAT YOU NEED: </b></p>\r<p>Four year co...	Charlotte	NC	US	28217	2012-03-21 02:03:44.137	2012-04-20 23:59:59
2	7	1	P/T HUMAN RESOURCES ASSISTANT	<b> <b> P/T HUMAN RESOURCES ASSISTANT</b> <...	Please refer to the Job Description to view th...	Winter Park	FL	US	32792	2012-03-02 16:36:55.447	2012-04-01 23:59:59
3	8	1	Route Delivery Drivers	CITY BEVERAGES Come to work for the best in th...	Please refer to the Job Description to view th...	Orlando	FL	US	NaN	2012-03-03 09:01:10.077	2012-04-02 23:59:59
4	9	1	Housekeeping	I make sure every part of their day is magica...	Please refer to the Job Description to view th...	Orlando	FL	US	NaN	2012-03-03 09:01:11.88	2012-04-02 23:59:59

Preprocessing

Splitting into Training and Testing dataset

currently not using this

In [5]:

users_training = users.loc[users['Split'] == 'Train']
users_testing = users.loc[users['Split'] == 'Test']

apps_training = apps.loc[apps['Split'] == 'Train']
apps_testing = apps.loc[apps['Split'] == 'Test']

user_history_training = user_history.loc[user_history['Split'] == 'Train']
user_history_testing = user_history.loc[user_history['Split'] == 'Test']

Dataframes

users_training
users_testing
apps_training
apps_testing
user_history_training
user_history_testing

In [6]:

jobs_US = jobs.loc[jobs['Country'] == 'US']
jobs_US_base_line = jobs_US.iloc[0:10000,0:8]

In [7]:

jobs_US_base_line['Title'] = jobs_US_base_line['Title'].fillna('')
jobs_US_base_line['Description'] = jobs_US_base_line['Description'].fillna('')
#jobs_US_base_line['Requirements'] = jobs_US_base_line['Requirements'].fillna('')

jobs_US_base_line['Description'] = jobs_US_base_line['Title'] + jobs_US_base_line['Description']

From here onwards use jobs_US_base_line data frame to work on, which is selected by jobs_US.iloc[0:10000,0:8].

Clean html

In [8]:

import re

def preprocessor(text):
    text = text.replace('\\r', '').replace('&nbsp', '').replace('\n', '')
    text = re.sub('<[^>]*>', '', text)
    emoticons = re.findall('(?::|;|=)(?:-)?(?:\)|\(|D|P)', text)
    text = re.sub('[\W]+', ' ', text.lower()) +\
        ' '.join(emoticons).replace('-', '')
    return text

In [9]:

jobs_US_base_line['Description'] = jobs_US_base_line['Description'].astype(dtype='str').apply(preprocessor)

In [10]:

apps.head()

	UserID	WindowID	Split	ApplicationDate	JobID
0	47	1	Train	2012-04-04 15:56:23.537	169528
1	47	1	Train	2012-04-06 01:03:00.003	284009
2	47	1	Train	2012-04-05 02:40:27.753	2121
3	47	1	Train	2012-04-05 02:37:02.673	848187
4	47	1	Train	2012-04-05 22:44:06.653	733748

In [11]:

apps.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1603111 entries, 0 to 1603110
Data columns (total 5 columns):
UserID             1603111 non-null int64
WindowID           1603111 non-null int64
Split              1603111 non-null object
ApplicationDate    1603111 non-null object
JobID              1603111 non-null int64
dtypes: int64(3), object(2)
memory usage: 61.2+ MB

In [12]:

application = apps_training[['UserID', 'JobID']]

In [13]:

grouped_apps = application.groupby(['UserID', 'JobID']).sum().reset_index()

In [14]:

grouped_apps['Quantity'] = 1

In [15]:

grouped_apps.head()

	UserID	JobID	Quantity
0	7	309823	1
1	7	703889	1
2	9	136489	1
3	9	617374	1
4	9	809208	1

Look-up

In [16]:

jobs_US_base_line.head()

	JobID	WindowID	Title	Description	Requirements	City	State	Country
0	1	1	Security Engineer/Technical Lead	security engineer technical leadsecurity clear...	<p>SKILL SET</p>\r<p> </p>\r<p>Network Se...	Washington	DC	US
1	4	1	SAP Business Analyst / WM	sap business analyst wmno corp to corp resumes...	<p><b>WHAT YOU NEED: </b></p>\r<p>Four year co...	Charlotte	NC	US
2	7	1	P/T HUMAN RESOURCES ASSISTANT	p t human resources assistant p t human resour...	Please refer to the Job Description to view th...	Winter Park	FL	US
3	8	1	Route Delivery Drivers	route delivery driverscity beverages come to w...	Please refer to the Job Description to view th...	Orlando	FL	US
4	9	1	Housekeeping	housekeepingi make sure every part of their da...	Please refer to the Job Description to view th...	Orlando	FL	US

In [17]:

job_lookup = jobs[['JobID','Title']].drop_duplicates()
job_lookup['Title'] = job_lookup.Title.astype(str)

In [18]:

job_lookup.head()

	JobID	Title
0	1	Security Engineer/Technical Lead
1	4	SAP Business Analyst / WM
2	7	P/T HUMAN RESOURCES ASSISTANT
3	8	Route Delivery Drivers
4	9	Housekeeping

Create the sparse ratings matrix of users and items utilizing the code

In [19]:

users = list(np.sort(grouped_apps.UserID.unique()))
jobs = list(grouped_apps.JobID.unique())
application = list(grouped_apps.Quantity)

rows = grouped_apps.UserID.astype('category', categories = users).cat.codes
cols = grouped_apps.JobID.astype('category', categories = jobs).cat.codes

apps_sparse = sparse.csr_matrix((application, (rows, cols)), shape=(len(users), len(jobs)))

/home/hectoryee/anaconda3/lib/python3.6/site-packages/ipykernel_launcher.py:5: FutureWarning: specifying 'categories' or 'ordered' in .astype() is deprecated; pass a CategoricalDtype instead
  """
/home/hectoryee/anaconda3/lib/python3.6/site-packages/ipykernel_launcher.py:6: FutureWarning: specifying 'categories' or 'ordered' in .astype() is deprecated; pass a CategoricalDtype instead

In [20]:

(apps_sparse)

<308033x349742 sparse matrix of type '<class 'numpy.int64'>'
	with 1417514 stored elements in Compressed Sparse Row format>

In [21]:

matrix_size = apps_sparse.shape[0] * apps_sparse.shape[1]
num_apps = len(apps_sparse.nonzero()[0])
sparsity = 100 * (1 - (num_apps/matrix_size))
sparsity

99.99868422290457

98.3% of the interaction matrix is sparse. For collaborative filtering to work, the maximum sparsity you could get away with would probably be about 99.5% or so. We are above this, might not get decent results.

Dataset

In [22]:

def make_train(ratings, pct_test = 0.2):
    '''
    This function will take in the original user-item matrix and "mask" a percentage of the original ratings where a
    user-item interaction has taken place for use as a test set. The test set will contain all of the original ratings, 
    while the training set replaces the specified percentage of them with a zero in the original ratings matrix. 
    
    parameters: 
    
    ratings - the original ratings matrix from which you want to generate a train/test set. Test is just a complete
    copy of the original set. This is in the form of a sparse csr_matrix. 
    
    pct_test - The percentage of user-item interactions where an interaction took place that you want to mask in the 
    training set for later comparison to the test set, which contains all of the original ratings. 
    
    returns:
    
    training_set - The altered version of the original data with a certain percentage of the user-item pairs 
    that originally had interaction set back to zero.
    
    test_set - A copy of the original ratings matrix, unaltered, so it can be used to see how the rank order 
    compares with the actual interactions.
    
    user_inds - From the randomly selected user-item indices, which user rows were altered in the training data.
    This will be necessary later when evaluating the performance via AUC.
    '''
    test_set = ratings.copy() # Make a copy of the original set to be the test set. 
    test_set[test_set != 0] = 1 # Store the test set as a binary preference matrix
    training_set = ratings.copy() # Make a copy of the original data we can alter as our training set. 
    nonzero_inds = training_set.nonzero() # Find the indices in the ratings data where an interaction exists
    nonzero_pairs = list(zip(nonzero_inds[0], nonzero_inds[1])) # Zip these pairs together of user,item index into list
    random.seed(0) # Set the random seed to zero for reproducibility
    num_samples = int(np.ceil(pct_test*len(nonzero_pairs))) # Round the number of samples needed to the nearest integer
    samples = random.sample(nonzero_pairs, num_samples) # Sample a random number of user-item pairs without replacement
    user_inds = [index[0] for index in samples] # Get the user row indices
    item_inds = [index[1] for index in samples] # Get the item column indices
    training_set[user_inds, item_inds] = 0 # Assign all of the randomly chosen user-item pairs to zero
    training_set.eliminate_zeros() # Get rid of zeros in sparse array storage after update to save space
    return training_set, test_set, list(set(user_inds)) # Output the unique list of user rows that were altered

In [23]:

product_train, product_test, product_users_altered = make_train(apps_sparse, pct_test = 0.2)

Implement implicit

In [24]:

import implicit

In [25]:

alpha = 15
user_vecs, item_vecs = implicit.alternating_least_squares((product_train*alpha).astype('double'), 
                                                          factors=20, 
                                                          regularization = 0.1, 
                                                         iterations = 50)

This method is deprecated. Please use the AlternatingLeastSquares class instead
WARNING:root:Intel MKL BLAS detected. Its highly recommend to set the environment variable 'export MKL_NUM_THREADS=1' to disable its internal multithreading
100%|██████████| 50.0/50 [00:30<00:00,  1.44it/s]

A Recommendation Example

In [30]:

users_arr = np.array(users) # Array of customer IDs from the ratings matrix
jobs_arr = np.array(jobs ) # Array of product IDs from the ratings matrix

In [31]:

def get_items_purchased(customer_id, mf_train, customers_list, products_list, item_lookup):
    '''
    This just tells me which items have been already purchased by a specific user in the training set. 
    
    parameters: 
    
    customer_id - Input the customer's id number that you want to see prior purchases of at least once
    
    mf_train - The initial ratings training set used (without weights applied)
    
    customers_list - The array of customers used in the ratings matrix
    
    products_list - The array of products used in the ratings matrix
    
    item_lookup - A simple pandas dataframe of the unique product ID/product descriptions available
    
    returns:
    
    A list of item IDs and item descriptions for a particular customer that were already purchased in the training set
    '''
    cust_ind = np.where(customers_list == customer_id)[0][0] # Returns the index row of our customer id
    purchased_ind = mf_train[cust_ind,:].nonzero()[1] # Get column indices of purchased items
    prod_codes = products_list[purchased_ind] # Get the stock codes for our purchased items
#     return prod_codes
    return item_lookup.loc[item_lookup.JobID.isin(prod_codes)]

In [32]:

users_arr[:5]

array([ 7,  9, 14, 16, 18])

In [33]:

job_lookup.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1091923 entries, 0 to 1091922
Data columns (total 2 columns):
JobID    1091923 non-null int64
Title    1091923 non-null object
dtypes: int64(1), object(1)
memory usage: 25.0+ MB

In [34]:

get_items_purchased(14, product_train, users_arr, jobs_arr, job_lookup)

	JobID	Title
605671	206046	Materials Manager / Director
626723	372423	Sales Manager
652060	574999	Mortgage Loan Specialist
663486	663552	Mortgage Professionals - All Levels
678829	787741	Planner Specialist
702882	978868	Branch Coordinator

In [35]:

from sklearn.preprocessing import MinMaxScaler

In [36]:

def rec_items(customer_id, mf_train, user_vecs, item_vecs, customer_list, item_list, item_lookup, num_items = 10):
    '''
    This function will return the top recommended items to our users 
    
    parameters:
    
    customer_id - Input the customer's id number that you want to get recommendations for
    
    mf_train - The training matrix you used for matrix factorization fitting
    
    user_vecs - the user vectors from your fitted matrix factorization
    
    item_vecs - the item vectors from your fitted matrix factorization
    
    customer_list - an array of the customer's ID numbers that make up the rows of your ratings matrix 
                    (in order of matrix)
    
    item_list - an array of the products that make up the columns of your ratings matrix
                    (in order of matrix)
    
    item_lookup - A simple pandas dataframe of the unique product ID/product descriptions available
    
    num_items - The number of items you want to recommend in order of best recommendations. Default is 10. 
    
    returns:
    
    - The top n recommendations chosen based on the user/item vectors for items never interacted with/purchased
    '''
    
    cust_ind = np.where(customer_list == customer_id)[0][0] # Returns the index row of our customer id
    pref_vec = mf_train[cust_ind,:].toarray() # Get the ratings from the training set ratings matrix
    pref_vec = pref_vec.reshape(-1) + 1 # Add 1 to everything, so that items not purchased yet become equal to 1
    pref_vec[pref_vec > 1] = 0 # Make everything already purchased zero
    rec_vector = user_vecs[cust_ind,:].dot(item_vecs.T) # Get dot product of user vector and all item vectors
    # Scale this recommendation vector between 0 and 1
    min_max = MinMaxScaler()
    rec_vector_scaled = min_max.fit_transform(rec_vector.reshape(-1,1))[:,0] 
    recommend_vector = pref_vec*rec_vector_scaled 
    # Items already purchased have their recommendation multiplied by zero
    product_idx = np.argsort(recommend_vector)[::-1][:num_items] # Sort the indices of the items into order 
    # of best recommendations
    rec_list = [] # start empty list to store items
    for index in product_idx:
        code = item_list[index]
        rec_list.append([code, item_lookup.Title.loc[item_lookup.JobID == code].iloc[0]]) 
        # Append our descriptions to the list
    codes = [item[0] for item in rec_list]
    descriptions = [item[1] for item in rec_list]
    final_frame = pd.DataFrame({'JobId': codes, 'Title': descriptions}) # Create a dataframe 
    return final_frame[['JobId', 'Title']] # Switch order of columns around

In [37]:

get_items_purchased(7, product_train, users_arr, jobs_arr, job_lookup)

	JobID	Title
479106	309823	Teller
527966	703889	Customer Service Representative, Now Accepting...

In [38]:

rec_items(7, product_train, user_vecs, item_vecs, users_arr, jobs_arr, job_lookup,
                       num_items = 10)

	JobId	Title
0	1101648	Customer Service/Receptionist/Front Desk
1	12561	Dental Hygienist - up to $40/ hour!
2	802205	Customer Service Representative
3	344145	Customer Service Representative
4	1050711	Customer Service Representative
5	601021	Customer Service/Receptionist/Call Center Rep
6	601126	Customer Service/Front Desk Help/Receptionist
7	394969	Call Center Customer Service Collections / Credit
8	287182	Administrative Assistant
9	1046722	Data Entry Associate

Second Implementation using implicit library.

In [39]:

from implicit.als import AlternatingLeastSquares

In [40]:

# train model
model = AlternatingLeastSquares(factors=50,
                                regularization=0.01,
                                dtype=np.float64,
                                iterations=50)

confidence = 40
model.fit(confidence * apps_sparse)

100%|██████████| 50.0/50 [00:51<00:00,  1.07it/s]

In [41]:

# recommend items for a user
user_items = apps_sparse.T.tocsr()
recommendations = model.recommend(7, user_items)

# find related items
# related = model.similar_items(itemid)

In [42]:

recommendations

[(57371, 0.029422828555080816),
 (95894, 0.028675761246652852),
 (168777, 0.026280032617673803),
 (186945, 0.025853081049481193),
 (6434, 0.022265122614712515),
 (215515, 0.02175052321349273),
 (286526, 0.02006039079638639),
 (188385, 0.019824223520708895),
 (306468, 0.019819373496054445),
 (122115, 0.019597172209618055)]

In [43]:

recommend=[]
for i in range(len(recommendations)):
    recommend.append(recommendations[i][0])

In [44]:

rec_items(7, product_train, user_vecs, item_vecs, users_arr, jobs_arr, job_lookup,
                       num_items = 10)

	JobId	Title
0	1101648	Customer Service/Receptionist/Front Desk
1	12561	Dental Hygienist - up to $40/ hour!
2	802205	Customer Service Representative
3	344145	Customer Service Representative
4	1050711	Customer Service Representative
5	601021	Customer Service/Receptionist/Call Center Rep
6	601126	Customer Service/Front Desk Help/Receptionist
7	394969	Call Center Customer Service Collections / Credit
8	287182	Administrative Assistant
9	1046722	Data Entry Associate

In [45]:

job_lookup.loc[job_lookup.JobID.isin(recommend)]

	JobID	Title
14973	57371	Termite Inspector (147-480)
72738	286526	Customer Service Representative/E-Marketing Co...
308148	168777	A/R Medical Biller
310916	186945	Wireless Project Manager
441140	6434	CNC Machinist
451716	95894	Part-Time Customer Service Representative
466763	215515	Product Specialist!!! - Excellent Opportunity!
594338	122115	Maintenance Technician
893325	306468	Data / Business Analyst – Deliver Relevance th...
996719	188385	Assistant Store Manager - Casual Male XL

In [46]:

get_items_purchased(7, product_train, users_arr, jobs_arr, job_lookup)

	JobID	Title
479106	309823	Teller
527966	703889	Customer Service Representative, Now Accepting...

References

Jupyter Notebook

Share on

Twitter Facebook LinkedIn

Xun Wei