i am trying to use spacy and read a CV file to match the text with a set of skills in a dictionary i made, the function check_all_majors returns dictionary with skills that are related to one major only instead of the skills related to all majors it has found in the cv
nlp = spacy.load('en_core_web_sm')
skill_dict = {
'Computer Science': {'Python', 'Java', 'C++', 'machine learning', 'data structures', 'algorithms'},
'Electrical Engineering': {'circuit design', 'power systems', 'analog electronics', 'digital signal processing'},
'Mechanical Engineering': {'CAD', 'mechanical design', 'materials science', 'thermodynamics'},
'Statistics': {'metrices', 'statistic', 'algorithm', 'mathmatics'}
}
def tokenize_cv(cv_file):
file_extension = cv_file.name.split('.')[-1]
if file_extension == 'docx':
cv_text = docx2txt.process(cv_file)
elif file_extension == 'txt':
cv_text = cv_file.read().decode('utf-8')
elif file_extension == 'rtf':
cv_text = textract.process(cv_file).decode('utf-8')
else:
raise ValueError('Unsupported file type')
print('CV text:', cv_text)
doc = nlp(cv_text)
print('Spacy tokens:', [token.text for token in doc])
tokens = [token.text.lower() for token in doc if not token.is_stop and not token.is_punct]
print('Filtered tokens:', tokens)
return tokens
def check_all_majors(tokens):
matches = {}
for major, skills in skill_dict.items():
intersect = skills.intersection(set(tokens))
if intersect:
matches[major] = list(intersect)
return matches
@login_required(login_url='login')
def editAccount(request):
if request.user.is_Seeker:
seeker = request.user.seeker
form = SeekerAccountForm(instance=seeker)
elif request.user.is_Recruiter:
recruiter = request.user.recruiter
form = RecruiterAccountForm(instance=recruiter)
AllSkills = []
if request.method == 'POST':
if request.user.is_Seeker:
form = SeekerAccountForm(request.POST, request.FILES, instance=seeker)
if form.is_valid():
# Validate file extension
file = request.FILES['cv']
try:
# Validate the file extension
validate_word_or_text_file(file)
except ValidationError as e:
form.add_error('cv', e)
messages.error(request, 'the cv format is not accepted, Try (.docx , .txt , .rtf)')
return render(request, 'account-edit.html', {'form': form})
file = request.FILES.get('cv', None)
if file:
tokens = tokenize_cv(file)
print('tokens:' , tokens)
matches = check_all_majors(tokens)
for major, skills in matches.items():
for skill in skills:
skill_obj = Skill(owner=seeker, category=major, name=skill)
skill_obj.save()
print(f'Saved skill {skill_obj.name} in category {skill_obj.category}')
AllSkills.append(skill_obj)
form.save()
messages.success(request, 'Your account has been updated!')
return redirect('account')
elif request.user.is_Recruiter:
form = RecruiterAccountForm(request.POST, request.FILES, instance=recruiter)
if form.is_valid():
form.save()
messages.success(request, 'Your account has been updated!')
return redirect('account')
context = {'form': form}
if request.user.is_Seeker and seeker is not None:
context['cv_skills'] = AllSkills
return render(request, 'account-edit.html', context)