from google.colab import drive
# Mount Google Drive into the Colab filesystem at /content/drive.
drive.mount('/content/drive')
Mounted at /content/drive
August 30, 2021
Taxonomy of recommender systems: A framework to classify and analyze a particular recommendation system. The system is described by the following dimensions:
1. domain
2. purpose
3. context
4. personalize level
5. whose opinions
6. privacy and trustworthiness
7. interfaces
8. algorithms
Domain: Type of content recommended. The domain of Netflix is movies and TV series.
Purpose: What is the purpose of the system, both for the end user and for the provider?
Context: The environment in which the consumer receives a recommendation.
Personalization Levels: 1. Non-personalized 2. Semi/Segment personalized 3. Personalized – the recommendation is based on data about the current user that indicates how the user has interacted with the system previously.
Algorithms: Content-based filtering uses the metadata available for the items in the catalog. Depending on the specific algorithm, the system can calculate recommendations either by taking the items the user has liked and finding similar content, by comparing the items and user profiles, or if there’s no user involved, by finding similar content between items.
/usr/local/lib/python3.7/dist-packages/ipykernel_launcher.py:1: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
"""Entry point for launching an IPython kernel.
import re
def add_spaces(text):
    """Insert spaces that are missing around punctuation, brackets and -/+.

    Four regex substitutions are applied in order:

    1. a space after one of ``!.,;:?`` when an uppercase ASCII letter follows;
    2. a space between an ASCII letter and an opening bracket ``( { [``;
    3. a space between a closing bracket ``) } ]`` and an ASCII letter;
    4. a space before a ``-`` or ``+`` that is glued to the preceding letter
       and followed by whitespace (that whitespace is replaced by one space).

    Args:
        text: The string to fix.

    Returns:
        The string with the spaces inserted.
    """
    # Space after punctuation. Only applies to !.,;:?
    # Eg: Dislike.However => Dislike. However
    text = re.sub(r"([!.,;:?])([A-Z])", r"\1 \2", text)
    # Space before an opening bracket.
    # Eg: Dislike(sth) => Dislike (sth)
    # The class is [A-Za-z]; the former [A-za-z] also matched [ \ ] ^ _ and `.
    text = re.sub(r"([A-Za-z])([\(\{\[])", r"\1 \2", text)
    # Space after a closing bracket.
    # Eg: (such as)I like => (such as) I like
    text = re.sub(r"([\)\}\]])([A-Za-z])", r"\1 \2", text)
    # Space between a word and a trailing - or + that is followed by whitespace.
    # Eg: server- side => server - side
    # Eg: mac-book => unchanged (no whitespace after the hyphen)
    text = re.sub(r"([A-Za-z])([-+])(\s)", r"\1 \2 ", text)
    return text
/usr/local/lib/python3.7/dist-packages/ipykernel_launcher.py:1: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
"""Entry point for launching an IPython kernel.
# Keep only the rows whose text differs between the two columns,
# report how many there are, and pull one at random for inspection.
temp_df = df.loc[df.no_html != df.add_space, ["no_html", "add_space"]]
print("Impacted row", temp_df.shape[0])
temp_df.sample()
Impacted row 2
no_html | add_space | |
---|---|---|
53 | Experience in Agile. development methodologies. Integration of user-facing elements developed by a back-end developer with server- side logic | Experience in Agile. development methodologies. Integration of user-facing elements developed by a back-end developer with server - side logic |
import string
def handle_listing_number(text):
    """Strip inline list markers such as ``1. 2. 3.`` from *text*.

    Every ``<digit 1-9>.`` followed by whitespace is treated as a candidate
    marker. The markers are only removed when at least two were found and
    their numbers appear in non-decreasing order; otherwise the original
    text is returned untouched.

    When a marker is removed, the text that preceded it is kept. If that
    text contains no punctuation, a ``.`` is appended so the list items stay
    separated. A ``1.`` at the very start of the text is dropped outright.

    Args:
        text: The string to clean.

    Returns:
        The text without list markers, or *text* unchanged when the digits
        do not look like an ordered list.
    """
    listing = []

    def remove_listing_number(match_obj):
        # group(1) holds the text before the marker; leading whitespace is
        # dropped. group(2) is the marker digit.
        prefix = match_obj.group(1).lstrip()
        number = int(match_obj.group(2))
        # Record every marker, including a leading "1."; without it a
        # two-item list that opens the text was never recognised.
        listing.append(number)
        if match_obj.start() == 0 and not prefix and number == 1:
            return ""
        if not any(c in string.punctuation for c in prefix):
            prefix += "."
        return prefix + " "

    new_text = re.sub(
        r"\b([\.\;\:\!\?\D]*)([1-9])\.\s",
        remove_listing_number,
        text.strip(),
    )
    # Only accept the rewrite when the markers form an ordered sequence.
    if len(listing) > 1 and sorted(listing) == listing:
        return new_text
    return text
/usr/local/lib/python3.7/dist-packages/ipykernel_launcher.py:1: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
"""Entry point for launching an IPython kernel.
from symspellpy import SymSpell, Verbosity, helpers
import contractions
def expand_contractions(text):
    """Expand shortened forms (e.g. "don't" -> "do not") in *text*.

    The output of ``contractions.fix`` is passed through
    ``helpers.transfer_casing_for_similar_text`` together with the original
    text, so that the original casing is carried over to the expanded text.
    Finally, every standalone lowercase "i" is upper-cased.
    """
    fixed = contractions.fix(text)
    recased = helpers.transfer_casing_for_similar_text(text, fixed)
    # Upper-case the pronoun "I".
    return re.sub(r"\bi\b", "I", recased)
/usr/local/lib/python3.7/dist-packages/ipykernel_launcher.py:1: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
"""Entry point for launching an IPython kernel.
# Remove and replace by empty space
def remove_redundant_elements(text):
    """Blank out URLs, phone numbers and e-mail addresses, then tidy whitespace.

    Each match is replaced by a single space. Newlines, tabs and carriage
    returns are then turned into spaces, and runs of whitespace are collapsed
    (leading and trailing whitespace is dropped as well).

    Args:
        text: The string to clean.

    Returns:
        The cleaned, single-spaced string.
    """
    # remove urls
    text = re.sub(r"http\S+", " ", text)
    # remove phone
    text = re.sub(r"[\+]?[(]?[0-9]{3}[)]?[-\s\.]?[0-9]{3}[-\s\.]?[0-9]{4,6}", " ", text)
    # remove email
    text = re.sub(r"[\w.+-]+@[\w-]+\.[\w.-]+", " ", text)
    # remove newline; a dict table is used because the two-argument form of
    # str.maketrans requires both strings to have the same length.
    table = str.maketrans({"\n": " ", "\t": " ", "\r": " "})
    text = text.translate(table)
    # remove redundant whitespaces
    return " ".join(text.split())
/usr/local/lib/python3.7/dist-packages/ipykernel_launcher.py:1: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
"""Entry point for launching an IPython kernel.
/usr/local/lib/python3.7/dist-packages/ipykernel_launcher.py:1: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
"""Entry point for launching an IPython kernel.
/usr/local/lib/python3.7/dist-packages/ipykernel_launcher.py:1: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
"""Entry point for launching an IPython kernel.
from itertools import groupby
def collapse_duplicated_punctuations(text):
    """Collapse each run of the same punctuation character into one.

    Only consecutive repeats of a single character from
    ``string.punctuation`` are collapsed ("!!!" -> "!"); every other
    character, including repeated letters and spaces, is kept as is. No
    space is appended after the punctuation.

    NOTE(review): this also shortens tokens such as "C++" to "C+" — confirm
    that is acceptable for the downstream skill matching.

    Args:
        text: The string to clean.

    Returns:
        The string with repeated punctuation collapsed.
    """
    pieces = []
    for char, run in groupby(text):
        if char in string.punctuation:
            # Keep a single copy of the repeated punctuation mark.
            pieces.append(char)
        else:
            pieces.extend(run)
    return "".join(pieces)
/usr/local/lib/python3.7/dist-packages/ipykernel_launcher.py:1: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
"""Entry point for launching an IPython kernel.
/usr/local/lib/python3.7/dist-packages/ipykernel_launcher.py:1: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
"""Entry point for launching an IPython kernel.
/usr/local/lib/python3.7/dist-packages/ipykernel_launcher.py:2: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
/usr/local/lib/python3.7/dist-packages/ipykernel_launcher.py:1: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
"""Entry point for launching an IPython kernel.
/usr/local/lib/python3.7/dist-packages/ipykernel_launcher.py:1: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
"""Entry point for launching an IPython kernel.
We will develop the recommender, which is used to suggest jobs based on learned courses.
To recommend the jobs based on courses we leverage 2 models via 2 steps:
Step 1: Firstly, we use a sentence transformer model to calculate the vector embeddings for the chosen courses and the jobs. Then we compare the similarity score (cosine score) between them and filter the top 10.
Step 2: Secondly, we use the NER model to extract the skills from the chosen courses and the jobs. Then we calculate the score for each job by counting the matching skills with chosen courses. That score will be used to sort the jobs to produce the final order.
import torch
import ast
from collections import defaultdict
def add_space(ent):
    """Replace every 'Ġ' in ``ent['word']`` with a space.

    The dict is modified in place and the same object is returned.
    'Ġ' is presumably the tokenizer's word-start marker — confirm against
    the NER model's tokenizer.
    """
    ent['word'] = ent['word'].replace('Ġ', ' ')
    return ent
def merge_B_I_entities(ents):
    """Fold each run of ``I-`` tokens into the ``B-`` token that precedes it.

    Every token is first passed through ``add_space``. A token whose
    ``entity`` starts with ``'B-'`` absorbs the ``'I-'`` tokens that directly
    follow it: their words are appended and ``end`` is moved to the end of
    the last absorbed token. Any other token (including an ``'I-'`` token
    with no ``'B-'`` in front of it) is kept on its own. The ``word`` of
    every returned entity is stripped and lower-cased.

    The input dicts are modified in place; the returned list holds the same
    dict objects.

    Args:
        ents: Token-level entity dicts with ``entity``, ``word`` and ``end``
            keys, in text order.

    Returns:
        The list of merged entity dicts.
    """
    results = []
    absorbing = False  # True while the last kept entity is a B- token
    for ent in ents:
        ent = add_space(ent)
        if absorbing and ent['entity'][:2] == 'I-':
            head = results[-1]
            head['word'] += ent['word']
            head['end'] = ent['end']
            continue
        results.append(ent)
        absorbing = ent['entity'][:2] == 'B-'
    # Normalise only after merging so that inner spaces are preserved.
    for ent in results:
        ent['word'] = ent['word'].strip().lower()
    return results
def merge_entity(ent1, ent2):
    """Join two entities when the second starts exactly where the first ends.

    Args:
        ent1: Left entity dict (``start``, ``end``, ``entity``, ``word``).
        ent2: Right entity dict with the same keys.

    Returns:
        A new dict spanning both entities, carrying ``ent1``'s label and the
        concatenated, stripped, lower-cased word; ``None`` when the two spans
        are not directly adjacent.
    """
    if ent1['end'] != ent2['start']:
        return None
    return {
        'start': ent1['start'],
        'end': ent2['end'],
        'entity': ent1['entity'],
        'word': (ent1['word'] + ent2['word']).strip().lower(),
    }
def merge_similar_entities(ents):
    """Merge directly adjacent entities that share the same label.

    Entities are grouped by their ``entity`` label. Within each group,
    consecutive entities are joined with ``merge_entity`` whenever that
    returns a merged dict. The result is ordered group by group (in order of
    first appearance of each label), not by position in the text.

    Args:
        ents: Entity dicts with ``entity``, ``start``, ``end`` and ``word``.

    Returns:
        The list of entities after merging.
    """
    by_label = defaultdict(list)
    for ent in ents:
        by_label[ent['entity']].append(ent)

    results = []
    for group in by_label.values():
        current = group[0]
        for following in group[1:]:
            combined = merge_entity(current, following)
            if combined:
                current = combined
            else:
                results.append(current)
                current = following
        # The trailing entity of each group is normalised before it is kept.
        current['word'] = current['word'].strip().lower()
        results.append(current)
    return results
def extract_skills(desc):
    """Return the distinct entity words found in *desc*.

    The text is run through the module-level ``classifier``; its output is
    merged with ``merge_B_I_entities`` and ``merge_similar_entities`` and the
    ``word`` of every resulting entity is collected without duplicates.
    """
    raw_entities = classifier(desc)
    merged = merge_similar_entities(merge_B_I_entities(raw_entities))
    return list({ent['word'] for ent in merged})
def get_skills_from_course_titles(course_titles):
    """Collect the skills of the courses whose title is in *course_titles*.

    Reads the module-level ``course_df``; each selected ``skills`` cell is
    parsed with ``ast.literal_eval`` and its items are added to the result.
    """
    chosen = course_df['title'].isin(course_titles)
    course_skills = set()
    for raw in course_df.loc[chosen, 'skills']:
        course_skills |= set(ast.literal_eval(raw))
    return course_skills
def show_results(results):
    """Print every item of *results* on its own line."""
    for row in results:
        print(row)
from collections import defaultdict
from dataclasses import dataclass
from typing import List
import ast
import torch
def get_skills_from_course_titles(course_titles):
    """Return the union of the skills of the selected courses.

    Rows of the module-level ``course_df`` whose ``title`` is in
    *course_titles* are selected; every ``skills`` cell is parsed with
    ``ast.literal_eval`` and merged into one set.
    """
    selected = course_df[course_df['title'].isin(course_titles)]
    parsed = (ast.literal_eval(cell) for cell in selected['skills'])
    return set().union(*parsed)
class Recommender:
    """Recommend jobs for a set of completed courses.

    Jobs are first scored by the cosine similarity between the averaged
    sentence embedding of the chosen courses and each job's embedding. That
    score is then blended with the share of the learner's skills that the
    job also lists, and the jobs are sorted by the blended score.
    """

    # Weights of the two signals in the final score.
    EMBED_WEIGHT = 0.6
    SKILL_WEIGHT = 0.4

    def __init__(self, ner_classifier, sent_model, course_df, setup=True):
        """Store the models and the course table.

        Args:
            ner_classifier: Callable applied to a text; it must return
                token-level entity dicts (see ``merge_B_I_entities``).
            sent_model: Object with an ``encode(texts, convert_to_tensor=True)``
                method.
            course_df: Course table with at least ``title`` and ``no_sw``
                columns.
            setup: When true, ``_setup`` copies the table and (re)computes
                its ``skills`` column.
        """
        self.classifier = ner_classifier
        self.sent_model = sent_model
        self.course_info = course_df
        if setup:
            self._setup(course_df)

    def _setup(self, course_df):
        """Work on a deep copy of *course_df* and fill its ``skills`` column.

        ``skills`` is computed by applying ``extract_skills`` to the
        ``extract_skills`` column of the table.
        """
        self.course_info = course_df.copy(deep=True)
        self.course_info['skills'] = self.course_info['extract_skills'].apply(self.extract_skills)

    def extract_skills(self, desc):
        """Return the distinct entity words the NER classifier finds in *desc*."""
        ents = self.classifier(desc)
        results = Recommender.merge_B_I_entities(ents)
        results = Recommender.merge_similar_entities(results)
        return list({ent['word'] for ent in results})

    def recommend(self, course_titles, job_info, topk: int = None) -> List[list]:
        """Rank the jobs in *job_info* for the given course titles.

        Args:
            course_titles: Titles of the courses the learner has taken.
            job_info: Job table with ``no_sw``, ``title`` and ``skills``
                columns.
            topk: Maximum number of results to return; ``None`` returns all.

        Returns:
            A list of ``[job index, job title, job skills, score]`` lists,
            best score first.
        """
        chosen = self.course_info[self.course_info['title'].isin(course_titles)]
        c_embed = self.sent_model.encode(list(chosen['no_sw']), convert_to_tensor=True)
        c_avg_embed = torch.mean(c_embed, axis=0)

        job_descs = list(job_info['no_sw'])
        job_embeds = self.sent_model.encode(job_descs, convert_to_tensor=True)
        results = Recommender.compare_embeds(c_avg_embed, job_embeds, job_info)

        # Use this instance's course table, not a notebook-level global.
        my_skills = Recommender.get_skills_from_course_titles(course_titles, self.course_info)

        for res in results:
            # Job skills may be stored as the repr of a list; parse them so
            # the overlap is computed on skills, not on single characters.
            job_skills = Recommender._as_skill_set(res[2])
            if len(my_skills) != 0:
                overlap = len(my_skills.intersection(job_skills)) / len(my_skills)
            else:
                overlap = 0
            res[-1] = res[-1] * self.EMBED_WEIGHT + self.SKILL_WEIGHT * overlap

        results.sort(key=lambda x: x[-1], reverse=True)
        if topk is not None:
            results = results[:topk]
        return results

    @staticmethod
    def _as_skill_set(skills):
        """Return *skills* as a set, parsing it first when it is a string."""
        if isinstance(skills, str):
            skills = ast.literal_eval(skills)
        return set(skills)

    @staticmethod
    def get_skills_from_course_titles(course_titles, course_info=None):
        """Collect the skills of the courses whose title is in *course_titles*.

        Args:
            course_titles: Titles to select.
            course_info: Table with ``title`` and ``skills`` columns. When
                omitted, the module-level ``course_df`` is used, as before.

        Returns:
            The set of skills of the selected courses. ``skills`` cells may
            hold either a list or the string repr of one.
        """
        if course_info is None:
            course_info = course_df
        course_skills = set()
        for skills in course_info[course_info['title'].isin(course_titles)]['skills']:
            course_skills.update(Recommender._as_skill_set(skills))
        return course_skills

    @staticmethod
    def add_space(ent):
        """Replace every 'Ġ' in ``ent['word']`` with a space, in place.

        'Ġ' is presumably the tokenizer's word-start marker — confirm
        against the NER model's tokenizer.
        """
        ent['word'] = ent['word'].replace('Ġ', ' ')
        return ent

    @staticmethod
    def merge_B_I_entities(ents):
        """Fold each run of ``I-`` tokens into the ``B-`` token before it.

        A ``'B-'`` token absorbs the ``'I-'`` tokens that directly follow
        it: words are concatenated and ``end`` moves to the last absorbed
        token. Any other token is kept on its own. Words of the returned
        entities are stripped and lower-cased. Input dicts are modified in
        place.
        """
        results = []
        absorbing = False  # True while the last kept entity is a B- token
        for ent in ents:
            ent = Recommender.add_space(ent)
            if absorbing and ent['entity'][:2] == 'I-':
                head = results[-1]
                head['word'] += ent['word']
                head['end'] = ent['end']
                continue
            results.append(ent)
            absorbing = ent['entity'][:2] == 'B-'
        for ent in results:
            ent['word'] = ent['word'].strip().lower()
        return results

    @staticmethod
    def merge_entity(ent1, ent2):
        """Join two entities when *ent2* starts exactly where *ent1* ends.

        Returns a new dict with ``ent1``'s label and the concatenated,
        stripped, lower-cased word, or ``None`` when the spans are not
        adjacent.
        """
        if ent1['end'] != ent2['start']:
            return None
        return {
            'start': ent1['start'],
            'end': ent2['end'],
            'entity': ent1['entity'],
            'word': (ent1['word'] + ent2['word']).strip().lower(),
        }

    @staticmethod
    def merge_similar_entities(ents):
        """Merge directly adjacent entities that share the same label.

        Entities are grouped by ``entity`` label and consecutive members of
        a group are joined with ``merge_entity`` when possible. The result
        is ordered group by group, not by text position.
        """
        by_label = defaultdict(list)
        for ent in ents:
            by_label[ent['entity']].append(ent)

        results = []
        for group in by_label.values():
            current = group[0]
            for following in group[1:]:
                combined = Recommender.merge_entity(current, following)
                if combined:
                    current = combined
                else:
                    results.append(current)
                    current = following
            current['word'] = current['word'].strip().lower()
            results.append(current)
        return results

    @staticmethod
    def compare_embeds(c_embed: List, job_embeds: List[List], job_info):
        """Score every job embedding against the course embedding.

        Returns ``[index label, title, skills, cosine score]`` lists sorted
        by score, highest first. ``util`` is expected to be provided
        elsewhere in the file (presumably ``sentence_transformers.util`` —
        confirm).
        """
        results = []
        for i, job_embed in enumerate(job_embeds):
            score = util.pytorch_cos_sim(c_embed, job_embed)
            results.append([job_info.index[i], job_info['title'].iloc[i], job_info['skills'].iloc[i], score.item()])
        results.sort(key=lambda x: x[-1], reverse=True)
        return results

    def post_process(self, outputs):
        """Return *outputs* unchanged (placeholder hook).

        Declared as a regular method: it takes ``self``, so the former
        ``@staticmethod`` decorator made ``instance.post_process(outputs)``
        fail.
        """
        return outputs
[77, 'Software Engineer in Data Science', "['google deepmind', 'autopilot', 'autopilot ai', 'vinai', 'python', 'v', 'gpus', 'ai tooling.', 'dashboards', 'numpy', 'vinai autopilot', 'pandas', 'h', 'vin b']", 0.34073367118835446]
[74, 'All-round Engineer', "['ue', 'laravel engine', 'agile', 'js', 'search engine', 'php', 'python', '4', 'batch']", 0.32163201570510863]
[92, 'Software Engineer, Observability', "['react', 'grab', 'elk', 'jaeger', 'python', 'nodejs', 'prometheus', 'amazon web services', 'observability', 'go', 'java', 'aws', 'unified observability', 'reduxflow', 'angular', 'golang']", 0.3089586853981018]
[127, 'AI Engineer, Kobiton', "['js', 'backend', 'mobiles', 'vietnam', 'java', 'mobile', 'desktops', 'postgredb', 'nguyen', 'ptim', 'k8s', 'golang', 'ghn express', 'docker', 'tablets', 'node', '0906738', 'mob', 'restful api', 'back', 'telegram', 'mongodb', 'p']", 0.30338029861450194]
[138, 'AI Engineer, Kobiton', "['rbvh', 'hcmc', 'c++', 'c', 'sap', 'python union', 'robert bosch gmbh', 'jav', 'robert bosch', 'matlab', 'microcontroller']", 0.30338029861450194]
[129, 'Fresh Software Engineer', "['orm', 'win', 'client', 'kms', 'c#', 'javascript', '19', '.net', 'wp', 'upstar labs', 'scrum', 'mysql', 'entity', 'cov', 'sql', 'asp.net mvc', 'nhibernate', 'microsoft sql server', 'qasymph', 'winform', 'web api']", 0.26918667554855347]
[89, 'Bridge Software Engineer (BSE)', "['c++', 'ho', 'c', '.net', 'project manager', 'java', 'ha']", 0.2661460340023041]
[79, 'Full Stack Software Engineer (.NET, C#)', "['reactjs', 'html', 'asp.net core', 'vs', '8:00 âģĵ 10: 00 am', 'azure devops', 'c#', 'javascript', 'git', 'hub', 'sql', '.net', 'dan', 'agile', 'angular']", 0.26532386541366576]
[0, 'Fresher Python Software Engineer', "['client', 'kms', '19', 'cov-', 'k', 'upstar labs']", 0.2651121497154236]
[113, '[ECM] Test Engineer', "['vuejs', 'loopback', 'react', 'ui-ux', 'kms', 'express', 'nodejs', 'kobiton', 'tricentis', 'angularjs', '19emic', 'katalon', 'k', '($100m', 'mobile', 'covid', 'koa', 'qasymphony']", 0.26384042501449584]
[92, 'Software Engineer, Observability', "['react', 'grab', 'elk', 'jaeger', 'python', 'nodejs', 'prometheus', 'amazon web services', 'observability', 'go', 'java', 'aws', 'unified observability', 'reduxflow', 'angular', 'golang']", 0.29281051754951476]
[78, 'SOFTWARE ENGINEER (HCM)', "['$700', 'devops', 'data engineer', '15000000.', 'vue.js', 'relational', 'java', '. 40000000', 'c¦ung', 'data analyst', 'angular.js', 'ch', 'v', '24-09-2021', 'yãĭu', 'sql', 'jvm', '$2000', 'python', 'react.js', 'scala']", 0.2801404237747192]
[36, 'System/Network Engineer_Freshers', "['hypervisors', '3 switch', 'shell', 'jun', 'super computer', 'vmware', 'qemui', 'vlan', 'linux', 'kvm', 'hyper-v', 'l', 'hpc']", 0.2568916082382202]
[113, '[ECM] Test Engineer', "['vuejs', 'loopback', 'react', 'ui-ux', 'kms', 'express', 'nodejs', 'kobiton', 'tricentis', 'angularjs', '19emic', 'katalon', 'k', '($100m', 'mobile', 'covid', 'koa', 'qasymphony']", 0.2493357181549072]
[77, 'Software Engineer in Data Science', "['google deepmind', 'autopilot', 'autopilot ai', 'vinai', 'python', 'v', 'gpus', 'ai tooling.', 'dashboards', 'numpy', 'vinai autopilot', 'pandas', 'h', 'vin b']", 0.24794490337371825]
[103, 'Software Engineer', "['5', 'html', 'client', 'cocos creator', 'cocos', 'javascript', 'css', '2dx', 'restful api', 'unity', 'mobile', 'type']", 0.24549003839492797]
[47, 'Fullstack Software Engineer (ReactJS/NodeJS/Go)', "['n', 'svn', 'webpack', 'javascript', 'git', 'es', '6', 'bower', 'reactjs', 'golang', 'babel', 'chotot', 'docker', 'web browser', 'nodejs', 'mac', 'json', 'grunt', 'full stack']", 0.2437905728816986]
[18, 'Software Engineer (Golang/Java)', "['server', 'backend', 'tidb', 'java', 'container', 'javascript', 'rdbms', 'git', 'css', 'angularjs', 'reactjs', 'golang', 'html', 'docker', 'mysql', 'redis', 'kafka', 'nosql', 'zalopay', 'frontend', 'jquery']", 0.24373272657394407]
[34, 'Automation Test Engineer', "['mocha', 'kms', 'appium', 'testng', 'kms labsasymph', 'nightwatch', 'jasmine', 'scrum', 'selenium']", 0.23755048513412474]
[104, 'Software Engineer', "['microsoft .net', 'sql server']", 0.23728473186492918]