This is my first attempt at tokenization using NLTK's RegexpTokenizer, which is required for an assignment. I'm not sure whether I should remove the brackets.
You are required to extract the tokens and append them to the list 'token'.
...not sure if I even did this right.
import codecs
import re
from itertools import chain

import nltk
import pandas as pd
from nltk.corpus import stopwords
from nltk.probability import *
from nltk.tokenize import RegexpTokenizer
from tqdm import tqdm
# Make sure the NLTK stop-word corpus is available locally.
nltk.download('stopwords')

# Load the job postings; the free text to tokenize is in 'job_description'.
df_text = pd.read_csv(r"C:\Users\User\Downloads\JobPostings.csv")

# Split on runs of whitespace (gaps=True means the pattern matches the
# separators, not the tokens). Built once because it is row-independent.
tokenizer_test = RegexpTokenizer(r"\s+", gaps=True)

lower = []  # lower-cased description for every row
token = []  # one list of tokens per row, in the same order as `lower`

# Blank cells are read by pandas as NaN (a float), which has no .lower();
# substituting '' keeps both lists aligned with the DataFrame rows.
for item in df_text['job_description'].fillna(''):
    lowered = item.lower()
    lower.append(lowered)
    # Tokenize the lower-cased text and accumulate the result, instead of
    # rebinding `token` on every pass and keeping only the last description.
    token.append(tokenizer_test.tokenize(lowered))

print(token)
Output is:
[['Data', 'Scientist,', '(Staff', 'or', 'Principal)', 'at', 'realtor.com', '(View', 'all', 'jobs)', 'Santa', 'Clara,', 'CA', 'At', 'realtor.com,', 'we', 'process', 'terabytes', 'of', 'data', 'every', 'day', 'and', 'transform', 'that', 'data', 'into', 'information', 'that', 'powers', 'decisions', 'for', 'millions', 'of', 'homebuyers,', 'renters,', 'dreamers,', 'and', 'real', 'estate', 'professionals.', 'We', 'aim', 'to', 'radically', 'simplify', 'home', 'buying/selling', 'and', 'help', 'more', 'people', 'achieve', 'the', 'American', 'dream', 'on', 'our', 'realtor.com', 'website', 'and', 'mobile', 'apps.', 'We', 'seek', 'a', 'highly', 'seasoned', 'Data', 'Scientist', 'to', 'join', 'our', 'data', 'science', 'program', 'and', 'help', 'develop', 'it', 'to', 'its', 'full', 'potential.', 'As', 'a', 'key', 'member', 'of', 'the', 'data', 'science', 'team,', 'you', 'will', 'be', 'responsible', 'for', 'the', 'development', 'of', 'innovative', 'concepts,', 'research,', 'predictive', 'modeling,', 'and', 'machine', 'learning', 'algorithms.', 'Responsibilities:', 'Perform', 'exploratory', 'analysis', 'on', "realtor.com's", 'wealth', 'of', 'data', 'including', 'consumer', 'web', 'and', 'mobile', 'behavior', 'and', 'North', 'America’s', 'most', 'comprehensive', 'and', 'up-to-date', 'listings', 'and', 'properties', 'data', 'set.', 'Effectively', 'partner', 'with', 'product', 'and', 'engineering', 'teams', 'to', 'build', 'new', 'data-driven', 'and', 'machine', 'learning-based', 'features', 'in', 'our', 'professional', 'software', 'and', 'lead', 'monetization', 'products', 'to', 'enable', 'real', 'state', 'professionals', 'to', 'be', 'more', 'productive', 'and', 'effective', 'in', 'serving', 'the', 'needs', 'of', 'home', 'shoppers.', 'Help', 'improve', 'the', 'scope', 'our', 'data', 'sets', 'by', 'identifying', 'new', 'data', 'collection', 'and', 'procurement', 'opportunities', 'on', 'an', 'ongoing', 'basis', 'Drive', 'A/B,', 'multivariate', 'tests', 'and', 'design', 'of', 
'experiments', 'to', 'facilitate', 'testing', 'of', 'new', 'product', 'and', 'design', 'features,', 'with', 'a', 'focus', 'on', 'improving', 'engagement,', 'retention,', 'and', 'conversion.', 'Select,', 'apply,', 'and', 'tune', 'a', 'diverse', 'set', 'of', 'tools', 'to', 'coherently', 'solve', 'challenging', 'business', 'goals', 'Create', 'automated', 'learning', 'systems', 'that', 'gracefully', 'scale', 'to', 'increasing', 'complexity', 'and', 'expectation', 'Develop', 'predictive,', 'explanatory', 'models', 'and', 'machine', 'learning', 'algorithms', 'Generate', 'descriptive', 'visualizations', 'and', 'presentations', 'to', 'communicate', 'insights', 'Mentor', 'a', 'team', 'of', 'data', 'scientists', 'on', 'data', 'exploration,', 'machine', 'learning', 'and', 'developing', 'data-based', 'products', 'Work', 'with', 'a', 'sense', 'of', 'ownership', 'and', 'urgency,', 'advocate', 'for', 'experimentation', 'based,', 'agile', 'culture.', 'Requirements:', 'MS', 'or', 'Ph.D.', 'in', 'statistics,', 'mathematics,', 'operations', 'research,', 'computer', 'science,', 'quantitative', 'analysis,', 'economics', 'or', 'related', 'field', 'is', 'required.', '7+', 'years', 'of', 'relevant', 'experience', 'in', 'data', 'science,', 'data', 'analytics,', 'or', 'applied', 'statistics,', 'Experience', 'with', 'machine', 'learning,', 'NLP,', 'data', 'mining,', 'statistical', 'modeling', 'tools,', 'and', 'underlying', 'algorithms', 'Experienced', 'in', 'R,', 'Perl,', 'Python,', 'Spark,', 'or', 'other', 'languages', 'and', 'frameworks', 'appropriate', 'for', 'large', 'scale', 'analysis', 'of', 'numerical,', 'textual,', 'image,', 'and', 'video', 'data', 'Strong', 'skills', 'in', 'data', 'gathering,', 'massaging', 'and', 'featurization', 'Working', 'experience', 'with', 'relational', 'databases', 'and', 'SQL', 'Experience', 'with', 'experiment', 'design', 'and', 'A/B', 'and', 'multivariate', 'tests', 'Experience', 'and', 'proven', 'track', 'record', 'developing', 'online', 'data', 
'products', 'Strong', 'creative', 'thinking', 'and', 'problem-solving', 'skills', 'Excellent', 'oral', 'and', 'written', 'communication', 'and', 'presentation', 'skills']]
Edit: I tried this instead. Thoughts?
# Column holding the raw job-description text, one entry per posting.
df_text_jd = df_text.job_description

# Split on runs of whitespace (gaps=True means the pattern matches the
# separators, not the tokens). Built once because it is row-independent.
tokenizer_test = RegexpTokenizer(r'\s+', gaps=True)

# Lower-cased copy of every description with round brackets removed.
# Blank cells are read by pandas as NaN (a float), which has no .lower();
# substituting '' keeps the result aligned with the DataFrame rows.
lower = []
for item in df_text_jd.fillna(''):
    lower.append(item.lower().replace('(', '').replace(')', ''))

# One token list per cleaned description. Iterating over `lower` (rather
# than over the characters of the last raw `item`) tokenizes every posting
# exactly once and actually uses the cleaned text.
l = []
for text in lower:
    l.append(tokenizer_test.tokenize(text))

# Bare expression: displays the result when run as a notebook cell.
l
You can remove the brackets by modifying the line where you append the lower-cased item to the `lower` list:
# Lower-case the description, then delete every '(' and ')' before storing it.
lower.append(item.lower().replace('(','').replace(')',''))